/*
* High-speed vectorized FFT code for arbitrary `logn`.
*
* =============================================================================
* Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
* ECE Department, George Mason University
* Fairfax, VA, U.S.A.
* Author: Duc Tri Nguyen
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================================
* @author Duc Tri Nguyen <[email protected]>, <[email protected]>
*/
#include "inner.h"
#include "macrof.h"
#include "macrofx4.h"
/*
* 1 layer of Forward FFT for 2 complex points (4 coefficients).
* Note: The scalar version is faster than vectorized code.
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log2(fpr *f) {
fpr x_re, x_im, y_re, y_im, v_re, v_im, t_re, t_im, s;
x_re = f[0];
y_re = f[1];
x_im = f[2];
y_im = f[3];
s = fpr_tab_log2[0];
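/*
 * Single butterfly with the first twiddle factor, whose real and imaginary
 * parts are equal (re == im in fpr_tab_log2): the complex product
 * v = y * (s + i*s) reduces to v_re = s*y_re - s*y_im and
 * v_im = s*y_re + s*y_im, and the outputs are x + v and x - v.
 */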
t_re = y_re * s;
t_im = y_im * s;
v_re = t_re - t_im;
v_im = t_re + t_im;
f[0] = x_re + v_re;
f[1] = x_re - v_re;
f[2] = x_im + v_im;
f[3] = x_im - v_im;
}
/*
* Vectorized 2 layers of Forward FFT for 4 complex points (8 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log3(fpr *f) {
// Total SIMD registers: 18 = 4 + 6 + 8
float64x2x4_t tmp; // 4
float64x2x2_t s_re_im, x, y; // 6
float64x2_t v_re, v_im, x_re, x_im, y_re, y_im, t_x, t_y; // 8
vloadx4(tmp, &f[0]);
s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
vfmul(v_re, tmp.val[1], s_re_im.val[0]);
vfmul(v_im, tmp.val[3], s_re_im.val[0]);
vfsub(t_x, v_re, v_im);
vfadd(t_y, v_re, v_im);
vfsub(tmp.val[1], tmp.val[0], t_x);
vfsub(tmp.val[3], tmp.val[2], t_y);
vfadd(tmp.val[0], tmp.val[0], t_x);
vfadd(tmp.val[2], tmp.val[2], t_y);
x_re = vtrn1q_f64(tmp.val[0], tmp.val[1]);
y_re = vtrn2q_f64(tmp.val[0], tmp.val[1]);
x_im = vtrn1q_f64(tmp.val[2], tmp.val[3]);
y_im = vtrn2q_f64(tmp.val[2], tmp.val[3]);
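/*
 * After vtrn1q/vtrn2q, x_re/x_im hold points 0 and 2 and y_re/y_im hold
 * points 1 and 3, so the second layer runs two independent butterflies,
 * (0, 1) and (2, 3), one per lane, with per-lane twiddles deinterleaved
 * from fpr_tab_log3 by vload2.
 */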
vload2(s_re_im, &fpr_tab_log3[0]);
FWD_TOP(v_re, v_im, y_re, y_im, s_re_im.val[0], s_re_im.val[1]);
FPC_ADD(x.val[0], y.val[0], x_re, x_im, v_re, v_im);
FPC_SUB(x.val[1], y.val[1], x_re, x_im, v_re, v_im);
vstore2(&f[0], x);
vstore2(&f[4], y);
}
/*
* Vectorized 3 layers of Forward FFT for 8 complex points (16 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log4(fpr *f) {
// Total SIMD registers: 26 = 8 + 18
float64x2x4_t t0, t1; // 8
float64x2x2_t x_re, x_im, y_re, y_im, v1, v2, tx, ty, s_re_im; // 18
vloadx4(t0, &f[0]);
vloadx4(t1, &f[8]);
vload(s_re_im.val[0], &fpr_tab_log2[0]);
vfmul(v1.val[0], t0.val[2], s_re_im.val[0]);
vfmul(v1.val[1], t0.val[3], s_re_im.val[0]);
vfmul(v2.val[0], t1.val[2], s_re_im.val[0]);
vfmul(v2.val[1], t1.val[3], s_re_im.val[0]);
vfsub(tx.val[0], v1.val[0], v2.val[0]);
vfsub(tx.val[1], v1.val[1], v2.val[1]);
vfadd(ty.val[0], v1.val[0], v2.val[0]);
vfadd(ty.val[1], v1.val[1], v2.val[1]);
FWD_BOT(t0.val[0], t1.val[0], t0.val[2], t1.val[2], tx.val[0], ty.val[0]);
FWD_BOT(t0.val[1], t1.val[1], t0.val[3], t1.val[3], tx.val[1], ty.val[1]);
vload(s_re_im.val[0], &fpr_tab_log3[0]);
FWD_TOP_LANE(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0]);
FWD_TOP_LANE(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0]);
FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
x_re.val[0] = t0.val[0];
x_re.val[1] = t0.val[2];
y_re.val[0] = t0.val[1];
y_re.val[1] = t0.val[3];
x_im.val[0] = t1.val[0];
x_im.val[1] = t1.val[2];
y_im.val[0] = t1.val[1];
y_im.val[1] = t1.val[3];
t0.val[0] = vzip1q_f64(x_re.val[0], x_re.val[1]);
t0.val[1] = vzip2q_f64(x_re.val[0], x_re.val[1]);
t0.val[2] = vzip1q_f64(y_re.val[0], y_re.val[1]);
t0.val[3] = vzip2q_f64(y_re.val[0], y_re.val[1]);
t1.val[0] = vzip1q_f64(x_im.val[0], x_im.val[1]);
t1.val[1] = vzip2q_f64(x_im.val[0], x_im.val[1]);
t1.val[2] = vzip1q_f64(y_im.val[0], y_im.val[1]);
t1.val[3] = vzip2q_f64(y_im.val[0], y_im.val[1]);
vload2(s_re_im, &fpr_tab_log4[0]);
FWD_TOP(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0], s_re_im.val[1]);
FWD_TOP(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0], s_re_im.val[1]);
FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
vstore4(&f[0], t0);
vstore4(&f[8], t1);
}
/*
* Vectorized 4 layers of Forward FFT for 16 complex points (32 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(fpr *f, const unsigned logn) {
// Total SIMD registers: 34 = 2 + 32
float64x2x2_t s_re_im; // 2
float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im, v_re, v_im; // 32
const unsigned int falcon_n = 1 << logn;
const unsigned int hn = falcon_n >> 1;
unsigned int level = logn - 3;
const fpr *fpr_tab2 = fpr_table[level++],
*fpr_tab3 = fpr_table[level++],
*fpr_tab4 = fpr_table[level++],
*fpr_tab5 = fpr_table[level];
int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
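/*
 * Each iteration processes a block of 16 consecutive complex points
 * (32 coefficients) and applies the four innermost FFT layers on it while
 * the data stays in registers; fpr_tab2..fpr_tab5 hold the twiddle factors
 * of those four layers, indexed by k2..k5.
 */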
for (unsigned j = 0; j < hn; j += 16) {
vload(s_re_im.val[0], &fpr_tab2[k2]);
/*
* Increase k2 only when j has the form j = 32*x + 16.
* Taking both sides modulo 32, this is the test (j & 31) == 16.
*/
k2 += 2 * ((j & 31) == 16);
vloadx4(y_re, &f[j + 8]);
vloadx4(y_im, &f[j + 8 + hn]);
if (logn == 5) {
// Handle the special case of fpr_tab_log2, where re == im.
// This reduces the number of multiplications, although it uses the same
// number of instructions as the "else" branch.
vfmulx4_i(t_im, y_im, s_re_im.val[0]);
vfmulx4_i(t_re, y_re, s_re_im.val[0]);
vfsubx4(v_re, t_re, t_im);
vfaddx4(v_im, t_re, t_im);
} else {
FWD_TOP_LANEx4(v_re, v_im, y_re, y_im, s_re_im.val[0]);
}
vloadx4(x_re, &f[j]);
vloadx4(x_im, &f[j + hn]);
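/*
 * Blocks at odd multiples of 16 reuse the fpr_tab2 entry loaded for the
 * preceding even block (k2 only advances every other block) and take the
 * FWD_BOTJx4 path; even blocks take FWD_BOTx4.
 */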
if ((j >> 4) & 1) {
FWD_BOTJx4(x_re, x_im, y_re, y_im, v_re, v_im);
} else {
FWD_BOTx4(x_re, x_im, y_re, y_im, v_re, v_im);
}
vload(s_re_im.val[0], &fpr_tab3[k3]);
k3 += 2;
FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[2], x_im.val[2], s_re_im.val[0]);
FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[2], y_im.val[2], s_re_im.val[0]);
FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0]);
FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0]);
FWD_BOT(x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
FWD_BOTJ(y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2]);
FWD_BOTJ(y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
vloadx2(s_re_im, &fpr_tab4[k4]);
k4 += 4;
FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0]);
FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[1]);
FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[1]);
FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
transpose_f64(x_re, x_re, v_re, 0, 2, 0);
transpose_f64(x_re, x_re, v_re, 1, 3, 1);
transpose_f64(x_im, x_im, v_im, 0, 2, 0);
transpose_f64(x_im, x_im, v_im, 1, 3, 1);
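/*
 * The 4x4 transposes leave the middle two registers swapped (compare the
 * index-tracking comments in iFFT_log4); swap val[1] and val[2] back so the
 * registers hold the points in sequential order for the last layer.
 */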
v_re.val[0] = x_re.val[2];
x_re.val[2] = x_re.val[1];
x_re.val[1] = v_re.val[0];
v_im.val[0] = x_im.val[2];
x_im.val[2] = x_im.val[1];
x_im.val[1] = v_im.val[0];
transpose_f64(y_re, y_re, v_re, 0, 2, 2);
transpose_f64(y_re, y_re, v_re, 1, 3, 3);
transpose_f64(y_im, y_im, v_im, 0, 2, 2);
transpose_f64(y_im, y_im, v_im, 1, 3, 3);
v_re.val[0] = y_re.val[2];
y_re.val[2] = y_re.val[1];
y_re.val[1] = v_re.val[0];
v_im.val[0] = y_im.val[2];
y_im.val[2] = y_im.val[1];
y_im.val[1] = v_im.val[0];
vload2(s_re_im, &fpr_tab5[k5]);
k5 += 4;
FWD_TOP(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0], s_re_im.val[1]);
FWD_TOP(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0], s_re_im.val[1]);
vload2(s_re_im, &fpr_tab5[k5]);
k5 += 4;
FWD_TOP(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[0], s_re_im.val[1]);
FWD_TOP(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0], s_re_im.val[1]);
FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
vstore4(&f[j], x_re);
vstore4(&f[j + hn], x_im);
FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
vstore4(&f[j + 8], y_re);
vstore4(&f[j + 8 + hn], y_im);
}
}
/*
* Vectorized 1 layer of Forward FFT for 16 complex points (32 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(fpr *f, const unsigned logn) {
const unsigned n = 1 << logn;
const unsigned hn = n >> 1;
const unsigned ht = n >> 2;
// Total SIMD registers: 25 = 1 + 24
float64x2_t s_re_im; // 1
float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im, v_re, v_im; // 24
s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
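/*
 * Single outermost layer: complex point j (j < n/4) is paired with point
 * j + n/4. The twiddle is the first table entry, whose real and imaginary
 * parts are equal, so the complex multiply reduces to two vfmulx4_i plus an
 * add/sub pair: v_re = s*(b_re - b_im), v_im = s*(b_re + b_im).
 */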
for (unsigned j = 0; j < ht; j += 8) {
vloadx4(b_re, &f[j + ht]);
vfmulx4_i(t_re, b_re, s_re_im);
vloadx4(b_im, &f[j + ht + hn]);
vfmulx4_i(t_im, b_im, s_re_im);
vfsubx4(v_re, t_re, t_im);
vfaddx4(v_im, t_re, t_im);
vloadx4(a_re, &f[j]);
vloadx4(a_im, &f[j + hn]);
FWD_BOTx4(a_re, a_im, b_re, b_im, v_re, v_im);
vstorex4(&f[j + ht], b_re);
vstorex4(&f[j], a_re);
vstorex4(&f[j + ht + hn], b_im);
vstorex4(&f[j + hn], a_im);
}
}
/*
* Vectorized 2 layers of Forward FFT for 16 complex points (32 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(fpr *f, const unsigned logn, const unsigned level) {
const unsigned int falcon_n = 1 << logn;
const unsigned int hn = falcon_n >> 1;
// Total SIMD registers: 26 = 8 + 16 + 2
float64x2x4_t t_re, t_im; // 8
float64x2x2_t x1_re, x2_re, x1_im, x2_im,
y1_re, y2_re, y1_im, y2_im; // 16
float64x2_t s1_re_im, s2_re_im; // 2
const fpr *fpr_tab1 = NULL, *fpr_tab2 = NULL;
unsigned l, len, start, j, k1, k2;
unsigned bar = logn - level + 2;
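/*
 * Each pass of the outer loop merges two consecutive FFT layers: the first
 * (distance 2*len in the split array) uses the shared twiddle s1_re_im from
 * fpr_tab1, the second (distance len) uses s2_re_im from fpr_tab2. `l` drops
 * by 2 per pass; the four innermost layers are left for FFT_log5.
 */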
for (l = level - 1; l > 4; l -= 2) {
len = 1 << (l - 2);
fpr_tab1 = fpr_table[bar++];
fpr_tab2 = fpr_table[bar++];
k1 = 0;
k2 = 0;
for (start = 0; start < hn; start += 1U << l) {
vload(s1_re_im, &fpr_tab1[k1]);
vload(s2_re_im, &fpr_tab2[k2]);
k1 += 2U * ((start & 127) == 64);
k2 += 2;
for (j = start; j < start + len; j += 4) {
vloadx2(y1_re, &f[j + 2 * len]);
vloadx2(y1_im, &f[j + 2 * len + hn]);
vloadx2(y2_re, &f[j + 3 * len]);
vloadx2(y2_im, &f[j + 3 * len + hn]);
FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
vloadx2(x1_re, &f[j]);
vloadx2(x1_im, &f[j + hn]);
vloadx2(x2_re, &f[j + len]);
vloadx2(x2_im, &f[j + len + hn]);
FWD_BOT(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
FWD_BOT(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
FWD_BOT(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
FWD_BOT(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
vstorex2(&f[j], x1_re);
vstorex2(&f[j + hn], x1_im);
vstorex2(&f[j + len], x2_re);
vstorex2(&f[j + len + hn], x2_im);
FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
vstorex2(&f[j + 2 * len], y1_re);
vstorex2(&f[j + 2 * len + hn], y1_im);
vstorex2(&f[j + 3 * len], y2_re);
vstorex2(&f[j + 3 * len + hn], y2_im);
}
start += 1U << l;
if (start >= hn) {
break;
}
vload(s1_re_im, &fpr_tab1[k1]);
vload(s2_re_im, &fpr_tab2[k2]);
k1 += 2U * ((start & 127) == 64);
k2 += 2;
for (j = start; j < start + len; j += 4) {
vloadx2(y1_re, &f[j + 2 * len]);
vloadx2(y1_im, &f[j + 2 * len + hn]);
vloadx2(y2_re, &f[j + 3 * len]);
vloadx2(y2_im, &f[j + 3 * len + hn]);
FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
vloadx2(x1_re, &f[j]);
vloadx2(x1_im, &f[j + hn]);
vloadx2(x2_re, &f[j + len]);
vloadx2(x2_im, &f[j + len + hn]);
FWD_BOTJ(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
FWD_BOTJ(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
FWD_BOTJ(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
FWD_BOTJ(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
vstorex2(&f[j], x1_re);
vstorex2(&f[j + hn], x1_im);
vstorex2(&f[j + len], x2_re);
vstorex2(&f[j + len + hn], x2_im);
FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
vstorex2(&f[j + 2 * len], y1_re);
vstorex2(&f[j + 2 * len + hn], y1_im);
vstorex2(&f[j + 3 * len], y2_re);
vstorex2(&f[j + 3 * len + hn], y2_im);
}
}
}
}
/*
* 1 layer of Inverse FFT for 2 complex points (4 coefficients).
* Note: The scalar version is faster than vectorized code.
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log2(fpr *f) {
fpr x_re, x_im, y_re, y_im, s;
x_re = f[0];
y_re = f[1];
x_im = f[2];
y_im = f[3];
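/*
 * Last inverse layer for n = 4 with the 1/(n/2) = 1/2 normalisation folded
 * in: f[0] and f[2] receive (x + y)/2, while the difference (x - y) is
 * multiplied by s = fpr_tab_log2[0]/2 (re == im for this entry) and
 * recombined into f[1] and f[3].
 */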
s = fpr_tab_log2[0] * 0.5;
f[0] = (x_re + y_re) * 0.5;
f[2] = (x_im + y_im) * 0.5;
x_re = (x_re - y_re) * s;
x_im = (x_im - y_im) * s;
f[1] = x_im + x_re;
f[3] = x_im - x_re;
}
/*
* Vectorized 2 layers of Inverse FFT for 4 complex points (8 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log3(fpr *f) {
// Total SIMD registers: 12 = 4 + 8
float64x2x4_t tmp; // 4
float64x2x2_t x_re_im, y_re_im, v, s_re_im; // 8
vload2(x_re_im, &f[0]);
vload2(y_re_im, &f[4]);
vfsub(v.val[0], x_re_im.val[0], x_re_im.val[1]);
vfsub(v.val[1], y_re_im.val[0], y_re_im.val[1]);
vfadd(x_re_im.val[0], x_re_im.val[0], x_re_im.val[1]);
vfadd(x_re_im.val[1], y_re_im.val[0], y_re_im.val[1]);
vload2(s_re_im, &fpr_tab_log3[0]);
vfmul(y_re_im.val[0], v.val[1], s_re_im.val[1]);
vfmla(y_re_im.val[0], y_re_im.val[0], v.val[0], s_re_im.val[0]);
vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
vfmls(y_re_im.val[1], y_re_im.val[1], v.val[0], s_re_im.val[1]);
tmp.val[0] = vtrn1q_f64(x_re_im.val[0], y_re_im.val[0]);
tmp.val[1] = vtrn2q_f64(x_re_im.val[0], y_re_im.val[0]);
tmp.val[2] = vtrn1q_f64(x_re_im.val[1], y_re_im.val[1]);
tmp.val[3] = vtrn2q_f64(x_re_im.val[1], y_re_im.val[1]);
s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
vfadd(x_re_im.val[0], tmp.val[0], tmp.val[1]);
vfadd(x_re_im.val[1], tmp.val[2], tmp.val[3]);
vfsub(v.val[0], tmp.val[0], tmp.val[1]);
vfsub(v.val[1], tmp.val[2], tmp.val[3]);
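/*
 * Fold the 1/(n/2) = 0.25 normalisation of the inverse transform into the
 * last layer: the sums are scaled directly, while the differences pick up
 * the factor through the scaled twiddle s_re_im.
 */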
vfmuln(tmp.val[0], x_re_im.val[0], 0.25);
vfmuln(tmp.val[2], x_re_im.val[1], 0.25);
vfmuln(s_re_im.val[0], s_re_im.val[0], 0.25);
vfmul(y_re_im.val[0], v.val[0], s_re_im.val[0]);
vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
vfadd(tmp.val[1], y_re_im.val[1], y_re_im.val[0]);
vfsub(tmp.val[3], y_re_im.val[1], y_re_im.val[0]);
vstorex4(&f[0], tmp);
}
/*
* Vectorized 3 layers of Inverse FFT for 8 complex points (16 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log4(fpr *f) {
// Total SIMD registers: 18 = 12 + 6
float64x2x4_t re, im, t; // 12
float64x2x2_t t_re, t_im, s_re_im; // 6
vload4(re, &f[0]);
vload4(im, &f[8]);
INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
vload2(s_re_im, &fpr_tab_log4[0]);
INV_BOTJ(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
INV_BOTJm(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
// re: 0, 4 | 1, 5 | 2, 6 | 3, 7
// im: 8, 12| 9, 13|10, 14|11, 15
transpose_f64(re, re, t, 0, 1, 0);
transpose_f64(re, re, t, 2, 3, 1);
transpose_f64(im, im, t, 0, 1, 2);
transpose_f64(im, im, t, 2, 3, 3);
// re: 0, 1 | 4, 5 | 2, 3 | 6, 7
// im: 8, 9 | 12, 13|10, 11| 14, 15
t.val[0] = re.val[1];
re.val[1] = re.val[2];
re.val[2] = t.val[0];
t.val[1] = im.val[1];
im.val[1] = im.val[2];
im.val[2] = t.val[1];
// re: 0, 1 | 2, 3| 4, 5 | 6, 7
// im: 8, 9 | 10, 11| 12, 13| 14, 15
INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
vload(s_re_im.val[0], &fpr_tab_log3[0]);
INV_BOTJ_LANE(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
INV_BOTJm_LANE(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[2], im.val[2]);
INV_TOPJ(t_re.val[1], t_im.val[1], re.val[1], im.val[1], re.val[3], im.val[3]);
vfmuln(re.val[0], re.val[0], 0.12500000000);
vfmuln(re.val[1], re.val[1], 0.12500000000);
vfmuln(im.val[0], im.val[0], 0.12500000000);
vfmuln(im.val[1], im.val[1], 0.12500000000);
s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
vfmuln(s_re_im.val[0], s_re_im.val[0], 0.12500000000);
vfmul(t_re.val[0], t_re.val[0], s_re_im.val[0]);
vfmul(t_re.val[1], t_re.val[1], s_re_im.val[0]);
vfmul(t_im.val[0], t_im.val[0], s_re_im.val[0]);
vfmul(t_im.val[1], t_im.val[1], s_re_im.val[0]);
vfsub(im.val[2], t_im.val[0], t_re.val[0]);
vfsub(im.val[3], t_im.val[1], t_re.val[1]);
vfadd(re.val[2], t_im.val[0], t_re.val[0]);
vfadd(re.val[3], t_im.val[1], t_re.val[1]);
vstorex4(&f[0], re);
vstorex4(&f[8], im);
}
/*
* Vectorized 4 layers of Inverse FFT for 16 complex points (32 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(fpr *f, const unsigned logn, const unsigned last) {
// Total SIMD registers: 26 = 24 + 2
float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im; // 24
float64x2x2_t s_re_im; // 2
const unsigned n = 1 << logn;
const unsigned hn = n >> 1;
unsigned int level = logn;
const fpr *fpr_tab5 = fpr_table[level--],
*fpr_tab4 = fpr_table[level--],
*fpr_tab3 = fpr_table[level--],
*fpr_tab2 = fpr_table[level];
int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
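/*
 * Mirror of FFT_log5: each iteration undoes the four innermost layers on a
 * block of 16 complex points. When `last` is set, the final normalisation
 * constant fpr_p2_tab[logn] is folded into the stored x outputs and into the
 * twiddle used for the y outputs.
 */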
for (unsigned j = 0; j < hn; j += 16) {
vload4(x_re, &f[j]);
vload4(x_im, &f[j + hn]);
INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
INV_TOPJm(t_re.val[2], t_im.val[2], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
vload4(y_re, &f[j + 8]);
vload4(y_im, &f[j + 8 + hn]);
INV_TOPJ(t_re.val[1], t_im.val[1], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
vload2(s_re_im, &fpr_tab5[k5]);
k5 += 4;
INV_BOTJ(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
INV_BOTJm(x_re.val[3], x_im.val[3], t_re.val[2], t_im.val[2], s_re_im.val[0], s_re_im.val[1]);
vload2(s_re_im, &fpr_tab5[k5]);
k5 += 4;
INV_BOTJ(y_re.val[1], y_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
INV_BOTJm(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0], s_re_im.val[1]);
transpose_f64(x_re, x_re, t_re, 0, 1, 0);
transpose_f64(x_re, x_re, t_re, 2, 3, 1);
transpose_f64(y_re, y_re, t_re, 0, 1, 2);
transpose_f64(y_re, y_re, t_re, 2, 3, 3);
transpose_f64(x_im, x_im, t_im, 0, 1, 0);
transpose_f64(x_im, x_im, t_im, 2, 3, 1);
transpose_f64(y_im, y_im, t_im, 0, 1, 2);
transpose_f64(y_im, y_im, t_im, 2, 3, 3);
t_re.val[0] = x_re.val[1];
x_re.val[1] = x_re.val[2];
x_re.val[2] = t_re.val[0];
t_re.val[1] = y_re.val[1];
y_re.val[1] = y_re.val[2];
y_re.val[2] = t_re.val[1];
t_im.val[0] = x_im.val[1];
x_im.val[1] = x_im.val[2];
x_im.val[2] = t_im.val[0];
t_im.val[1] = y_im.val[1];
y_im.val[1] = y_im.val[2];
y_im.val[2] = t_im.val[1];
INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
INV_TOPJm(t_re.val[1], t_im.val[1], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
INV_TOPJ(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
vloadx2(s_re_im, &fpr_tab4[k4]);
k4 += 4;
INV_BOTJ_LANE(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
INV_BOTJm_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
INV_BOTJ_LANE(y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2], s_re_im.val[1]);
INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[1]);
INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2]);
INV_TOPJ(t_re.val[1], t_im.val[1], x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3]);
INV_TOPJm(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2]);
INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3]);
vload(s_re_im.val[0], &fpr_tab3[k3]);
k3 += 2;
INV_BOTJ_LANE(x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0], s_re_im.val[0]);
INV_BOTJ_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
INV_BOTJm_LANE(y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2], s_re_im.val[0]);
INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0]);
if ((j >> 4) & 1) {
INV_TOPJmx4(t_re, t_im, x_re, x_im, y_re, y_im);
} else {
INV_TOPJx4(t_re, t_im, x_re, x_im, y_re, y_im);
}
vload(s_re_im.val[0], &fpr_tab2[k2]);
k2 += 2 * ((j & 31) == 16);
if (last) {
vfmuln(s_re_im.val[0], s_re_im.val[0], fpr_p2_tab[logn]);
vfmulnx4(x_re, x_re, fpr_p2_tab[logn]);
vfmulnx4(x_im, x_im, fpr_p2_tab[logn]);
}
vstorex4(&f[j], x_re);
vstorex4(&f[j + hn], x_im);
if (logn == 5) {
// Special case in fpr_tab_log2 where re == im
vfmulx4_i(t_re, t_re, s_re_im.val[0]);
vfmulx4_i(t_im, t_im, s_re_im.val[0]);
vfaddx4(y_re, t_im, t_re);
vfsubx4(y_im, t_im, t_re);
} else {
if ((j >> 4) & 1) {
INV_BOTJm_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
} else {
INV_BOTJ_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
}
}
vstorex4(&f[j + 8], y_re);
vstorex4(&f[j + 8 + hn], y_im);
}
}
/*
* Vectorized 1 layer of Inverse FFT for 16 complex points (32 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(fpr *f, const unsigned logn, const unsigned last) {
// Total SIMD registers: 26 = 24 + 2
float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im; // 24
float64x2_t s_re_im; // 2
const unsigned n = 1 << logn;
const unsigned hn = n >> 1;
const unsigned ht = n >> 2;
for (unsigned j = 0; j < ht; j += 8) {
vloadx4(a_re, &f[j]);
vloadx4(a_im, &f[j + hn]);
vloadx4(b_re, &f[j + ht]);
vloadx4(b_im, &f[j + ht + hn]);
INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im);
s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
if (last) {
vfmuln(s_re_im, s_re_im, fpr_p2_tab[logn]);
vfmulnx4(a_re, a_re, fpr_p2_tab[logn]);
vfmulnx4(a_im, a_im, fpr_p2_tab[logn]);
}
vstorex4(&f[j], a_re);
vstorex4(&f[j + hn], a_im);
vfmulx4_i(t_re, t_re, s_re_im);
vfmulx4_i(t_im, t_im, s_re_im);
vfaddx4(b_re, t_im, t_re);
vfsubx4(b_im, t_im, t_re);
vstorex4(&f[j + ht], b_re);
vstorex4(&f[j + ht + hn], b_im);
}
}
/*
* Vectorized 2 layers of Inverse FFT for 16 complex points (32 coefficients).
*/
static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(fpr *f, const unsigned logn, const unsigned level, unsigned last) {
const unsigned int falcon_n = 1 << logn;
const unsigned int hn = falcon_n >> 1;
// Total SIMD registers: 26 = 8 + 16 + 2
float64x2x4_t t_re, t_im; // 8
float64x2x2_t x1_re, x2_re, x1_im, x2_im,
y1_re, y2_re, y1_im, y2_im; // 16
float64x2_t s1_re_im, s2_re_im; // 2
const fpr *fpr_inv_tab1 = NULL, *fpr_inv_tab2 = NULL;
unsigned l, len, start, j, k1, k2;
unsigned bar = logn - 4;
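/*
 * Mirror of FFT_logn2: each pass of the outer loop undoes two layers,
 * walking `l` upward from 4. The `last` counter selects the pass on which
 * the normalisation constant fpr_p2_tab[logn] is folded into the stored
 * outputs and into s2_re_im.
 */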
for (l = 4; l < logn - level - 1; l += 2) {
len = 1 << l;
last -= 1;
fpr_inv_tab1 = fpr_table[bar--];
fpr_inv_tab2 = fpr_table[bar--];
k1 = 0;
k2 = 0;
for (start = 0; start < hn; start += 1U << (l + 2)) {
vload(s1_re_im, &fpr_inv_tab1[k1]);
vload(s2_re_im, &fpr_inv_tab2[k2]);
k1 += 2;
k2 += 2U * ((start & 127) == 64);
if (!last) {
vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
}
for (j = start; j < start + len; j += 4) {
vloadx2(x1_re, &f[j]);
vloadx2(x1_im, &f[j + hn]);
vloadx2(y1_re, &f[j + len]);
vloadx2(y1_im, &f[j + len + hn]);
INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
vloadx2(x2_re, &f[j + 2 * len]);
vloadx2(x2_im, &f[j + 2 * len + hn]);
vloadx2(y2_re, &f[j + 3 * len]);
vloadx2(y2_im, &f[j + 3 * len + hn]);
INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
INV_TOPJ(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
INV_TOPJ(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
INV_BOTJ_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
INV_BOTJ_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
INV_BOTJ_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
INV_BOTJ_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
vstorex2(&f[j + 2 * len], x2_re);
vstorex2(&f[j + 2 * len + hn], x2_im);
vstorex2(&f[j + 3 * len], y2_re);
vstorex2(&f[j + 3 * len + hn], y2_im);
if (!last) {
vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
}
vstorex2(&f[j], x1_re);
vstorex2(&f[j + hn], x1_im);
vstorex2(&f[j + len], y1_re);
vstorex2(&f[j + len + hn], y1_im);
}
start += 1U << (l + 2);
if (start >= hn) {
break;
}
vload(s1_re_im, &fpr_inv_tab1[k1]);
vload(s2_re_im, &fpr_inv_tab2[k2]);
k1 += 2;
k2 += 2U * ((start & 127) == 64);
if (!last) {
vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
}
for (j = start; j < start + len; j += 4) {
vloadx2(x1_re, &f[j]);
vloadx2(x1_im, &f[j + hn]);
vloadx2(y1_re, &f[j + len]);
vloadx2(y1_im, &f[j + len + hn]);
INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
vloadx2(x2_re, &f[j + 2 * len]);
vloadx2(x2_im, &f[j + 2 * len + hn]);
vloadx2(y2_re, &f[j + 3 * len]);
vloadx2(y2_im, &f[j + 3 * len + hn]);
INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
INV_TOPJm(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
INV_TOPJm(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
INV_TOPJm(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
INV_TOPJm(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
INV_BOTJm_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
INV_BOTJm_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
vstorex2(&f[j + 2 * len], x2_re);
vstorex2(&f[j + 2 * len + hn], x2_im);
vstorex2(&f[j + 3 * len], y2_re);
vstorex2(&f[j + 3 * len + hn], y2_im);
if (!last) {
vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
}
vstorex2(&f[j], x1_re);
vstorex2(&f[j + hn], x1_im);
vstorex2(&f[j + len], y1_re);
vstorex2(&f[j + len + hn], y1_im);
}
}
}
}
/*
* Scalable vectorized Forward FFT implementation.
* Supports logn in [1, 10].
* Can easily be extended to logn > 10.
*/
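/*
 * Layer split (as read from the dispatch below): for logn <= 5 a single
 * specialized kernel handles all layers; for logn >= 6 the outermost layers
 * are done first by FFT_logn1 (one layer) and/or FFT_logn2 (two layers per
 * pass), and FFT_log5 finishes with the four innermost layers on blocks of
 * 16 complex points kept in registers.
 */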
void PQCLEAN_FALCONPADDED512_AARCH64_FFT(fpr *f, const unsigned logn) {
unsigned level = logn;
switch (logn) {
case 2:
PQCLEAN_FALCONPADDED512_AARCH64_FFT_log2(f);
break;
case 3:
PQCLEAN_FALCONPADDED512_AARCH64_FFT_log3(f);
break;
case 4:
PQCLEAN_FALCONPADDED512_AARCH64_FFT_log4(f);
break;
case 5:
PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, 5);
break;
case 6:
PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(f, logn);
PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
break;
case 7:
case 9:
PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(f, logn, level);
PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
break;
case 8:
case 10:
PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(f, logn);
PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(f, logn, level - 1);
PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
break;
default:
break;
}
}
/*
* Scalable vectorized Inverse FFT implementation.
* Supports logn in [1, 10].
* Can easily be extended to logn > 10.
*/
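/*
 * The inverse dispatch mirrors the forward one: iFFT_log5 first undoes the
 * four innermost layers, then iFFT_logn2 and/or iFFT_logn1 undo the outer
 * layers; the final argument marks the call that applies the last
 * normalisation.
 */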
void PQCLEAN_FALCONPADDED512_AARCH64_iFFT(fpr *f, const unsigned logn) {
const unsigned level = (logn - 5) & 1;
switch (logn) {
case 2:
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log2(f);
break;
case 3:
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log3(f);
break;
case 4:
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log4(f);
break;
case 5:
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, 5, 1);
break;
case 6:
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(f, logn, 1);
break;
case 7:
case 9:
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(f, logn, level, 1);
break;
case 8:
case 10:
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(f, logn, level, 0);
PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(f, logn, 1);
break;
default:
break;
}
}