src/sig/falcon/pqclean_falcon-padded-512_aarch64/sampler.c - platform/external/liboqs - Git at Google

 /*
  * Falcon signature generation.
  *
  * ==========================(LICENSE BEGIN)============================
  *
  * Copyright (c) 2017-2019  Falcon Project
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sublicense, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be
  * included in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * ===========================(LICENSE END)=============================
  *
  * @author   Thomas Pornin <[email protected]>
  */

 #include "inner.h"
 #include <arm_neon.h>

 /*
  * Sample an integer value along a half-gaussian distribution centered
  * on zero and standard deviation 1.8205, with a precision of 72 bits.
  */
 int
 PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(prng *p) {

     static const uint32_t dist[] = {
         10745844u,  3068844u,  3741698u,
         5559083u,  1580863u,  8248194u,
         2260429u, 13669192u,  2736639u,
         708981u,  4421575u, 10046180u,
         169348u,  7122675u,  4136815u,
         30538u, 13063405u,  7650655u,
         4132u, 14505003u,  7826148u,
         417u, 16768101u, 11363290u,
         31u,  8444042u,  8086568u,
         1u, 12844466u,   265321u,
         0u,  1232676u, 13644283u,
         0u,    38047u,  9111839u,
         0u,      870u,  6138264u,
         0u,       14u, 12545723u,
         0u,        0u,  3104126u,
         0u,        0u,    28824u,
         0u,        0u,      198u,
         0u,        0u,        1u
     };

     uint32_t v0, v1, v2, hi;
     uint64_t lo;
     int z;

     /*
      * Get a random 72-bit value, into three 24-bit limbs v0..v2.
      */
     lo = prng_get_u64(p);
     hi = prng_get_u8(p);
     v0 = (uint32_t)lo & 0xFFFFFF;
     v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
     v2 = (uint32_t)(lo >> 48) | (hi << 16);

     /*
      * Sampled value is z, such that v0..v2 is lower than the first
      * z elements of the table.
      */

     uint32x4x3_t w;
     uint32x4_t x0, x1, x2, cc0, cc1, cc2, zz;
     uint32x2x3_t wh;
     uint32x2_t cc0h, cc1h, cc2h, zzh;
     x0 = vdupq_n_u32(v0);
     x1 = vdupq_n_u32(v1);
     x2 = vdupq_n_u32(v2);

     // 0: 0, 3, 6, 9
     // 1: 1, 4, 7, 10
     // 2: 2, 5, 8, 11
     // v0 - w0
     // v1 - w1
     // v2 - w2
     // cc1 - cc0 >> 31
     // cc2 - cc1 >> 31
     // z + cc2 >> 31
     w = vld3q_u32(&dist[0]);
     cc0 = vsubq_u32(x0, w.val[2]);
     cc1 = vsubq_u32(x1, w.val[1]);
     cc2 = vsubq_u32(x2, w.val[0]);
     cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
     cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
     zz = vshrq_n_u32(cc2, 31);

     w = vld3q_u32(&dist[12]);
     cc0 = vsubq_u32(x0, w.val[2]);
     cc1 = vsubq_u32(x1, w.val[1]);
     cc2 = vsubq_u32(x2, w.val[0]);
     cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
     cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
     zz = vsraq_n_u32(zz, cc2, 31);

     w = vld3q_u32(&dist[24]);
     cc0 = vsubq_u32(x0, w.val[2]);
     cc1 = vsubq_u32(x1, w.val[1]);
     cc2 = vsubq_u32(x2, w.val[0]);
     cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
     cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
     zz = vsraq_n_u32(zz, cc2, 31);

     w = vld3q_u32(&dist[36]);
     cc0 = vsubq_u32(x0, w.val[2]);
     cc1 = vsubq_u32(x1, w.val[1]);
     cc2 = vsubq_u32(x2, w.val[0]);
     cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
     cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
     zz = vsraq_n_u32(zz, cc2, 31);

     // 0: 48, 51
     // 1: 49, 52
     // 2: 50, 53
     wh = vld3_u32(&dist[48]);
     cc0h = vsub_u32(vget_low_u32(x0), wh.val[2]);
     cc1h = vsub_u32(vget_low_u32(x1), wh.val[1]);
     cc2h = vsub_u32(vget_low_u32(x2), wh.val[0]);
     cc1h = (uint32x2_t)vsra_n_s32((int32x2_t)cc1h, (int32x2_t)cc0h, 31);
     cc2h = (uint32x2_t)vsra_n_s32((int32x2_t)cc2h, (int32x2_t)cc1h, 31);
     zzh = vshr_n_u32(cc2h, 31);

     z = (int) (vaddvq_u32(zz) + vaddv_u32(zzh));
     return z;
 }

 /*
  * Sample a bit with probability exp(-x) for some x >= 0.
  */
 static int
 BerExp(prng *p, fpr x, fpr ccs) {
     int s, i;
     fpr r;
     uint32_t sw, w;
     uint64_t z;

     /*
      * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
      * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
      */
     s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
     r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));

     /*
      * It may happen (quite rarely) that s >= 64; if sigma = 1.2
      * (the minimum value for sigma), r = 0 and b = 1, then we get
      * s >= 64 if the half-Gaussian produced a z >= 13, which happens
      * with probability about 0.000000000230383991, which is
      * approximatively equal to 2^(-32). In any case, if s >= 64,
      * then BerExp will be non-zero with probability less than
      * 2^(-64), so we can simply saturate s at 63.
      */
     sw = (uint32_t)s;
     sw ^= (sw ^ 63) & -((63 - sw) >> 31);
     s = (int)sw;

     /*
      * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
      * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
      * We scale it up to 2^64, then right-shift it by s bits because
      * we really want exp(-x) = 2^(-s)*exp(-r).
      *
      * The "-1" operation makes sure that the value fits on 64 bits
      * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
      * case). The bias is negligible since fpr_expm_p63() only computes
      * with 51 bits of precision or so.
      */
     z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;

     /*
      * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
      * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
      * PRNG output to limit its consumption, the sign of the difference
      * yields the expected result.
      */
     i = 64;
     do {
         i -= 8;
         w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
     } while (!w && i > 0);
     return (int)(w >> 31);
 }

 /*
  * The sampler produces a random integer that follows a discrete Gaussian
  * distribution, centered on mu, and with standard deviation sigma. The
  * provided parameter isigma is equal to 1/sigma.
  *
  * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
  * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
  */
 int
 PQCLEAN_FALCONPADDED512_AARCH64_sampler(void *ctx, fpr mu, fpr isigma) {
     sampler_context *spc;
     int s;
     fpr r, dss, ccs;

     spc = ctx;

     /*
      * Center is mu. We compute mu = s + r where s is an integer
      * and 0 <= r < 1.
      */
     s = (int)fpr_floor(mu);
     r = fpr_sub(mu, fpr_of(s));

     /*
      * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
      */
     dss = fpr_half(fpr_sqr(isigma));

     /*
      * ccs = sigma_min / sigma = sigma_min * isigma.
      */
     ccs = fpr_mul(isigma, spc->sigma_min);

     /*
      * We now need to sample on center r.
      */
     for (;;) {
         int z0, z, b;
         fpr x;

         /*
          * Sample z for a Gaussian distribution. Then get a
          * random bit b to turn the sampling into a bimodal
          * distribution: if b = 1, we use z+1, otherwise we
          * use -z. We thus have two situations:
          *
          *  - b = 1: z >= 1 and sampled against a Gaussian
          *    centered on 1.
          *  - b = 0: z <= 0 and sampled against a Gaussian
          *    centered on 0.
          */
         z0 = PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(&spc->p);
         b = (int)prng_get_u8(&spc->p) & 1;
         z = b + ((b << 1) - 1) * z0;

         /*
          * Rejection sampling. We want a Gaussian centered on r;
          * but we sampled against a Gaussian centered on b (0 or
          * 1). But we know that z is always in the range where
          * our sampling distribution is greater than the Gaussian
          * distribution, so rejection works.
          *
          * We got z with distribution:
          *    G(z) = exp(-((z-b)^2)/(2*sigma0^2))
          * We target distribution:
          *    S(z) = exp(-((z-r)^2)/(2*sigma^2))
          * Rejection sampling works by keeping the value z with
          * probability S(z)/G(z), and starting again otherwise.
          * This requires S(z) <= G(z), which is the case here.
          * Thus, we simply need to keep our z with probability:
          *    P = exp(-x)
          * where:
          *    x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
          *
          * Here, we scale up the Bernouilli distribution, which
          * makes rejection more probable, but makes rejection
          * rate sufficiently decorrelated from the Gaussian
          * center and standard deviation that the whole sampler
          * can be said to be constant-time.
          */
         x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
         x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
         if (BerExp(&spc->p, x, ccs)) {
             /*
              * Rejection sampling was centered on r, but the
              * actual center is mu = s + r.
              */
             return s + z;
         }
     }
 }
	/*
	* Falcon signature generation.
	*
	* ==========================(LICENSE BEGIN)============================
	*
	* Copyright (c) 2017-2019 Falcon Project
	*
	* Permission is hereby granted, free of charge, to any person obtaining
	* a copy of this software and associated documentation files (the
	* "Software"), to deal in the Software without restriction, including
	* without limitation the rights to use, copy, modify, merge, publish,
	* distribute, sublicense, and/or sell copies of the Software, and to
	* permit persons to whom the Software is furnished to do so, subject to
	* the following conditions:
	*
	* The above copyright notice and this permission notice shall be
	* included in all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
	* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
	* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	*
	* ===========================(LICENSE END)=============================
	*
	* @author Thomas Pornin <[email protected]>
	*/

	#include "inner.h"
	#include <arm_neon.h>

	/*
	* Sample an integer value along a half-gaussian distribution centered
	* on zero and standard deviation 1.8205, with a precision of 72 bits.
	*/
	int
	PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(prng *p) {

	static const uint32_t dist[] = {
	10745844u, 3068844u, 3741698u,
	5559083u, 1580863u, 8248194u,
	2260429u, 13669192u, 2736639u,
	708981u, 4421575u, 10046180u,
	169348u, 7122675u, 4136815u,
	30538u, 13063405u, 7650655u,
	4132u, 14505003u, 7826148u,
	417u, 16768101u, 11363290u,
	31u, 8444042u, 8086568u,
	1u, 12844466u, 265321u,
	0u, 1232676u, 13644283u,
	0u, 38047u, 9111839u,
	0u, 870u, 6138264u,
	0u, 14u, 12545723u,
	0u, 0u, 3104126u,
	0u, 0u, 28824u,
	0u, 0u, 198u,
	0u, 0u, 1u
	};

	uint32_t v0, v1, v2, hi;
	uint64_t lo;
	int z;

	/*
	* Get a random 72-bit value, into three 24-bit limbs v0..v2.
	*/
	lo = prng_get_u64(p);
	hi = prng_get_u8(p);
	v0 = (uint32_t)lo & 0xFFFFFF;
	v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
	v2 = (uint32_t)(lo >> 48) \| (hi << 16);

	/*
	* Sampled value is z, such that v0..v2 is lower than the first
	* z elements of the table.
	*/

	uint32x4x3_t w;
	uint32x4_t x0, x1, x2, cc0, cc1, cc2, zz;
	uint32x2x3_t wh;
	uint32x2_t cc0h, cc1h, cc2h, zzh;
	x0 = vdupq_n_u32(v0);
	x1 = vdupq_n_u32(v1);
	x2 = vdupq_n_u32(v2);

	// 0: 0, 3, 6, 9
	// 1: 1, 4, 7, 10
	// 2: 2, 5, 8, 11
	// v0 - w0
	// v1 - w1
	// v2 - w2
	// cc1 - cc0 >> 31
	// cc2 - cc1 >> 31
	// z + cc2 >> 31
	w = vld3q_u32(&dist[0]);
	cc0 = vsubq_u32(x0, w.val[2]);
	cc1 = vsubq_u32(x1, w.val[1]);
	cc2 = vsubq_u32(x2, w.val[0]);
	cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
	cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
	zz = vshrq_n_u32(cc2, 31);

	w = vld3q_u32(&dist[12]);
	cc0 = vsubq_u32(x0, w.val[2]);
	cc1 = vsubq_u32(x1, w.val[1]);
	cc2 = vsubq_u32(x2, w.val[0]);
	cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
	cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
	zz = vsraq_n_u32(zz, cc2, 31);

	w = vld3q_u32(&dist[24]);
	cc0 = vsubq_u32(x0, w.val[2]);
	cc1 = vsubq_u32(x1, w.val[1]);
	cc2 = vsubq_u32(x2, w.val[0]);
	cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
	cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
	zz = vsraq_n_u32(zz, cc2, 31);

	w = vld3q_u32(&dist[36]);
	cc0 = vsubq_u32(x0, w.val[2]);
	cc1 = vsubq_u32(x1, w.val[1]);
	cc2 = vsubq_u32(x2, w.val[0]);
	cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
	cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
	zz = vsraq_n_u32(zz, cc2, 31);

	// 0: 48, 51
	// 1: 49, 52
	// 2: 50, 53
	wh = vld3_u32(&dist[48]);
	cc0h = vsub_u32(vget_low_u32(x0), wh.val[2]);
	cc1h = vsub_u32(vget_low_u32(x1), wh.val[1]);
	cc2h = vsub_u32(vget_low_u32(x2), wh.val[0]);
	cc1h = (uint32x2_t)vsra_n_s32((int32x2_t)cc1h, (int32x2_t)cc0h, 31);
	cc2h = (uint32x2_t)vsra_n_s32((int32x2_t)cc2h, (int32x2_t)cc1h, 31);
	zzh = vshr_n_u32(cc2h, 31);

	z = (int) (vaddvq_u32(zz) + vaddv_u32(zzh));
	return z;
	}

	/*
	* Sample a bit with probability exp(-x) for some x >= 0.
	*/
	static int
	BerExp(prng *p, fpr x, fpr ccs) {
	int s, i;
	fpr r;
	uint32_t sw, w;
	uint64_t z;

	/*
	* Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
	* and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
	*/
	s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
	r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));

	/*
	* It may happen (quite rarely) that s >= 64; if sigma = 1.2
	* (the minimum value for sigma), r = 0 and b = 1, then we get
	* s >= 64 if the half-Gaussian produced a z >= 13, which happens
	* with probability about 0.000000000230383991, which is
	* approximatively equal to 2^(-32). In any case, if s >= 64,
	* then BerExp will be non-zero with probability less than
	* 2^(-64), so we can simply saturate s at 63.
	*/
	sw = (uint32_t)s;
	sw ^= (sw ^ 63) & -((63 - sw) >> 31);
	s = (int)sw;

	/*
	* Compute exp(-r); we know that 0 <= r < log(2) at this point, so
	* we can use fpr_expm_p63(), which yields a result scaled to 2^63.
	* We scale it up to 2^64, then right-shift it by s bits because
	* we really want exp(-x) = 2^(-s)*exp(-r).
	*
	* The "-1" operation makes sure that the value fits on 64 bits
	* (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
	* case). The bias is negligible since fpr_expm_p63() only computes
	* with 51 bits of precision or so.
	*/
	z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;

	/*
	* Sample a bit with probability exp(-x). Since x = s*log(2) + r,
	* exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
	* PRNG output to limit its consumption, the sign of the difference
	* yields the expected result.
	*/
	i = 64;
	do {
	i -= 8;
	w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
	} while (!w && i > 0);
	return (int)(w >> 31);
	}

	/*
	* The sampler produces a random integer that follows a discrete Gaussian
	* distribution, centered on mu, and with standard deviation sigma. The
	* provided parameter isigma is equal to 1/sigma.
	*
	* The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
	* 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
	*/
	int
	PQCLEAN_FALCONPADDED512_AARCH64_sampler(void *ctx, fpr mu, fpr isigma) {
	sampler_context *spc;
	int s;
	fpr r, dss, ccs;

	spc = ctx;

	/*
	* Center is mu. We compute mu = s + r where s is an integer
	* and 0 <= r < 1.
	*/
	s = (int)fpr_floor(mu);
	r = fpr_sub(mu, fpr_of(s));

	/*
	* dss = 1/(2sigma^2) = 0.5(isigma^2).
	*/
	dss = fpr_half(fpr_sqr(isigma));

	/*
	* ccs = sigma_min / sigma = sigma_min * isigma.
	*/
	ccs = fpr_mul(isigma, spc->sigma_min);

	/*
	* We now need to sample on center r.
	*/
	for (;;) {
	int z0, z, b;
	fpr x;

	/*
	* Sample z for a Gaussian distribution. Then get a
	* random bit b to turn the sampling into a bimodal
	* distribution: if b = 1, we use z+1, otherwise we
	* use -z. We thus have two situations:
	*
	* - b = 1: z >= 1 and sampled against a Gaussian
	* centered on 1.
	* - b = 0: z <= 0 and sampled against a Gaussian
	* centered on 0.
	*/
	z0 = PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(&spc->p);
	b = (int)prng_get_u8(&spc->p) & 1;
	z = b + ((b << 1) - 1) * z0;

	/*
	* Rejection sampling. We want a Gaussian centered on r;
	* but we sampled against a Gaussian centered on b (0 or
	* 1). But we know that z is always in the range where
	* our sampling distribution is greater than the Gaussian
	* distribution, so rejection works.
	*
	* We got z with distribution:
	* G(z) = exp(-((z-b)^2)/(2*sigma0^2))
	* We target distribution:
	* S(z) = exp(-((z-r)^2)/(2*sigma^2))
	* Rejection sampling works by keeping the value z with
	* probability S(z)/G(z), and starting again otherwise.
	* This requires S(z) <= G(z), which is the case here.
	* Thus, we simply need to keep our z with probability:
	* P = exp(-x)
	* where:
	* x = ((z-r)^2)/(2sigma^2) - ((z-b)^2)/(2sigma0^2)
	*
	* Here, we scale up the Bernouilli distribution, which
	* makes rejection more probable, but makes rejection
	* rate sufficiently decorrelated from the Gaussian
	* center and standard deviation that the whole sampler
	* can be said to be constant-time.
	*/
	x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
	x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
	if (BerExp(&spc->p, x, ccs)) {
	/*
	* Rejection sampling was centered on r, but the
	* actual center is mu = s + r.
	*/
	return s + z;
	}
	}
	}