blob: e77dc4b525ae93a41b11b30bea0d48816f0b36e0 [file] [log] [blame]
/*
* Falcon signature generation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2017-2019 Falcon Project
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <[email protected]>
*/
#include "inner.h"
#include <arm_neon.h>
/*
* Sample an integer value along a half-gaussian distribution centered
* on zero and standard deviation 1.8205, with a precision of 72 bits.
*/
int
PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(prng *p) {
static const uint32_t dist[] = {
10745844u, 3068844u, 3741698u,
5559083u, 1580863u, 8248194u,
2260429u, 13669192u, 2736639u,
708981u, 4421575u, 10046180u,
169348u, 7122675u, 4136815u,
30538u, 13063405u, 7650655u,
4132u, 14505003u, 7826148u,
417u, 16768101u, 11363290u,
31u, 8444042u, 8086568u,
1u, 12844466u, 265321u,
0u, 1232676u, 13644283u,
0u, 38047u, 9111839u,
0u, 870u, 6138264u,
0u, 14u, 12545723u,
0u, 0u, 3104126u,
0u, 0u, 28824u,
0u, 0u, 198u,
0u, 0u, 1u
};
uint32_t v0, v1, v2, hi;
uint64_t lo;
int z;
/*
* Get a random 72-bit value, into three 24-bit limbs v0..v2.
*/
lo = prng_get_u64(p);
hi = prng_get_u8(p);
v0 = (uint32_t)lo & 0xFFFFFF;
v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
v2 = (uint32_t)(lo >> 48) | (hi << 16);
/*
* Sampled value is z, such that v0..v2 is lower than the first
* z elements of the table.
*/
uint32x4x3_t w;
uint32x4_t x0, x1, x2, cc0, cc1, cc2, zz;
uint32x2x3_t wh;
uint32x2_t cc0h, cc1h, cc2h, zzh;
x0 = vdupq_n_u32(v0);
x1 = vdupq_n_u32(v1);
x2 = vdupq_n_u32(v2);
// 0: 0, 3, 6, 9
// 1: 1, 4, 7, 10
// 2: 2, 5, 8, 11
// v0 - w0
// v1 - w1
// v2 - w2
// cc1 - cc0 >> 31
// cc2 - cc1 >> 31
// z + cc2 >> 31
w = vld3q_u32(&dist[0]);
cc0 = vsubq_u32(x0, w.val[2]);
cc1 = vsubq_u32(x1, w.val[1]);
cc2 = vsubq_u32(x2, w.val[0]);
cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
zz = vshrq_n_u32(cc2, 31);
w = vld3q_u32(&dist[12]);
cc0 = vsubq_u32(x0, w.val[2]);
cc1 = vsubq_u32(x1, w.val[1]);
cc2 = vsubq_u32(x2, w.val[0]);
cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
zz = vsraq_n_u32(zz, cc2, 31);
w = vld3q_u32(&dist[24]);
cc0 = vsubq_u32(x0, w.val[2]);
cc1 = vsubq_u32(x1, w.val[1]);
cc2 = vsubq_u32(x2, w.val[0]);
cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
zz = vsraq_n_u32(zz, cc2, 31);
w = vld3q_u32(&dist[36]);
cc0 = vsubq_u32(x0, w.val[2]);
cc1 = vsubq_u32(x1, w.val[1]);
cc2 = vsubq_u32(x2, w.val[0]);
cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
zz = vsraq_n_u32(zz, cc2, 31);
// 0: 48, 51
// 1: 49, 52
// 2: 50, 53
wh = vld3_u32(&dist[48]);
cc0h = vsub_u32(vget_low_u32(x0), wh.val[2]);
cc1h = vsub_u32(vget_low_u32(x1), wh.val[1]);
cc2h = vsub_u32(vget_low_u32(x2), wh.val[0]);
cc1h = (uint32x2_t)vsra_n_s32((int32x2_t)cc1h, (int32x2_t)cc0h, 31);
cc2h = (uint32x2_t)vsra_n_s32((int32x2_t)cc2h, (int32x2_t)cc1h, 31);
zzh = vshr_n_u32(cc2h, 31);
z = (int) (vaddvq_u32(zz) + vaddv_u32(zzh));
return z;
}
/*
* Sample a bit with probability exp(-x) for some x >= 0.
*/
static int
BerExp(prng *p, fpr x, fpr ccs) {
int s, i;
fpr r;
uint32_t sw, w;
uint64_t z;
/*
* Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
* and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
*/
s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
/*
* It may happen (quite rarely) that s >= 64; if sigma = 1.2
* (the minimum value for sigma), r = 0 and b = 1, then we get
* s >= 64 if the half-Gaussian produced a z >= 13, which happens
* with probability about 0.000000000230383991, which is
* approximatively equal to 2^(-32). In any case, if s >= 64,
* then BerExp will be non-zero with probability less than
* 2^(-64), so we can simply saturate s at 63.
*/
sw = (uint32_t)s;
sw ^= (sw ^ 63) & -((63 - sw) >> 31);
s = (int)sw;
/*
* Compute exp(-r); we know that 0 <= r < log(2) at this point, so
* we can use fpr_expm_p63(), which yields a result scaled to 2^63.
* We scale it up to 2^64, then right-shift it by s bits because
* we really want exp(-x) = 2^(-s)*exp(-r).
*
* The "-1" operation makes sure that the value fits on 64 bits
* (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
* case). The bias is negligible since fpr_expm_p63() only computes
* with 51 bits of precision or so.
*/
z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
/*
* Sample a bit with probability exp(-x). Since x = s*log(2) + r,
* exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
* PRNG output to limit its consumption, the sign of the difference
* yields the expected result.
*/
i = 64;
do {
i -= 8;
w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
} while (!w && i > 0);
return (int)(w >> 31);
}
/*
* The sampler produces a random integer that follows a discrete Gaussian
* distribution, centered on mu, and with standard deviation sigma. The
* provided parameter isigma is equal to 1/sigma.
*
* The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
* 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
*/
int
PQCLEAN_FALCONPADDED512_AARCH64_sampler(void *ctx, fpr mu, fpr isigma) {
sampler_context *spc;
int s;
fpr r, dss, ccs;
spc = ctx;
/*
* Center is mu. We compute mu = s + r where s is an integer
* and 0 <= r < 1.
*/
s = (int)fpr_floor(mu);
r = fpr_sub(mu, fpr_of(s));
/*
* dss = 1/(2*sigma^2) = 0.5*(isigma^2).
*/
dss = fpr_half(fpr_sqr(isigma));
/*
* ccs = sigma_min / sigma = sigma_min * isigma.
*/
ccs = fpr_mul(isigma, spc->sigma_min);
/*
* We now need to sample on center r.
*/
for (;;) {
int z0, z, b;
fpr x;
/*
* Sample z for a Gaussian distribution. Then get a
* random bit b to turn the sampling into a bimodal
* distribution: if b = 1, we use z+1, otherwise we
* use -z. We thus have two situations:
*
* - b = 1: z >= 1 and sampled against a Gaussian
* centered on 1.
* - b = 0: z <= 0 and sampled against a Gaussian
* centered on 0.
*/
z0 = PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(&spc->p);
b = (int)prng_get_u8(&spc->p) & 1;
z = b + ((b << 1) - 1) * z0;
/*
* Rejection sampling. We want a Gaussian centered on r;
* but we sampled against a Gaussian centered on b (0 or
* 1). But we know that z is always in the range where
* our sampling distribution is greater than the Gaussian
* distribution, so rejection works.
*
* We got z with distribution:
* G(z) = exp(-((z-b)^2)/(2*sigma0^2))
* We target distribution:
* S(z) = exp(-((z-r)^2)/(2*sigma^2))
* Rejection sampling works by keeping the value z with
* probability S(z)/G(z), and starting again otherwise.
* This requires S(z) <= G(z), which is the case here.
* Thus, we simply need to keep our z with probability:
* P = exp(-x)
* where:
* x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
*
* Here, we scale up the Bernouilli distribution, which
* makes rejection more probable, but makes rejection
* rate sufficiently decorrelated from the Gaussian
* center and standard deviation that the whole sampler
* can be said to be constant-time.
*/
x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
if (BerExp(&spc->p, x, ccs)) {
/*
* Rejection sampling was centered on r, but the
* actual center is mu = s + r.
*/
return s + z;
}
}
}