| /* |
| * Floating-point operations. |
| * |
| * ==========================(LICENSE BEGIN)============================ |
| * |
| * Copyright (c) 2017-2019 Falcon Project |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining |
| * a copy of this software and associated documentation files (the |
| * "Software"), to deal in the Software without restriction, including |
| * without limitation the rights to use, copy, modify, merge, publish, |
| * distribute, sublicense, and/or sell copies of the Software, and to |
| * permit persons to whom the Software is furnished to do so, subject to |
| * the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be |
| * included in all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| * |
| * ===========================(LICENSE END)============================= |
| * |
| * @author Thomas Pornin <[email protected]> |
| */ |
| |
| /* ====================================================================== */ |
| |
#include <math.h>
#include <stdint.h>
#include <immintrin.h>
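/*
 * Minimal stand-in for the project's FMADD() helper so that this file
 * is self-contained (in the full Falcon tree an equivalent macro is
 * provided elsewhere): FMADD(a, b, c) computes a*b + c on AVX2
 * vectors, fused when FMA opcodes are available.
 */
#ifndef FMADD
#ifdef __FMA__
#define FMADD(a, b, c)   _mm256_fmadd_pd(a, b, c)
#else
#define FMADD(a, b, c)   _mm256_add_pd(_mm256_mul_pd(a, b), c)
#endif
#endif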
| |
| /* |
| * We wrap the native 'double' type into a structure so that the C compiler |
| * complains if we inadvertently use raw arithmetic operators on the 'fpr' |
| * type instead of using the inline functions below. This should have no |
| * extra runtime cost, since all the functions below are 'inline'. |
| */ |
| typedef struct { |
| double v; |
| } fpr; |
| |
| static inline fpr |
| FPR(double v) { |
| fpr x; |
| |
| x.v = v; |
| return x; |
| } |
| |
| static inline fpr |
| fpr_of(int64_t i) { |
| return FPR((double)i); |
| } |
| |
| static const fpr fpr_q = { 12289.0 }; |
| static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 }; |
static const fpr fpr_inv_2sqrsigma0 = { 0.150865048875372721532312163019 };
/*
 * The two tables below are indexed by logn (1 to 10, i.e. degrees
 * n = 2 to 1024); entry 0 is unused.
 */
static const fpr fpr_inv_sigma[] = {
| { 0.0 }, /* unused */ |
| { 0.0069054793295940891952143765991630516 }, |
| { 0.0068102267767177975961393730687908629 }, |
| { 0.0067188101910722710707826117910434131 }, |
| { 0.0065883354370073665545865037227681924 }, |
| { 0.0064651781207602900738053897763485516 }, |
| { 0.0063486788828078995327741182928037856 }, |
| { 0.0062382586529084374473367528433697537 }, |
| { 0.0061334065020930261548984001431770281 }, |
| { 0.0060336696681577241031668062510953022 }, |
| { 0.0059386453095331159950250124336477482 } |
| }; |
| static const fpr fpr_sigma_min[] = { |
| { 0.0 }, /* unused */ |
| { 1.1165085072329102588881898380334015 }, |
| { 1.1321247692325272405718031785357108 }, |
| { 1.1475285353733668684571123112513188 }, |
| { 1.1702540788534828939713084716509250 }, |
| { 1.1925466358390344011122170489094133 }, |
| { 1.2144300507766139921088487776957699 }, |
| { 1.2359260567719808790104525941706723 }, |
| { 1.2570545284063214162779743112075080 }, |
| { 1.2778336969128335860256340575729042 }, |
| { 1.2982803343442918539708792538826807 } |
| }; |
| static const fpr fpr_log2 = { 0.69314718055994530941723212146 }; |
| static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 }; |
| static const fpr fpr_bnorm_max = { 16822.4121 }; |
| static const fpr fpr_zero = { 0.0 }; |
| static const fpr fpr_one = { 1.0 }; |
| static const fpr fpr_two = { 2.0 }; |
| static const fpr fpr_onehalf = { 0.5 }; |
| static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 }; |
| static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 }; |
| static const fpr fpr_ptwo31 = { 2147483648.0 }; |
| static const fpr fpr_ptwo31m1 = { 2147483647.0 }; |
| static const fpr fpr_mtwo31m1 = { -2147483647.0 }; |
| static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 }; |
| static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 }; |
| static const fpr fpr_ptwo63 = { 9223372036854775808.0 }; |
| |
| static inline int64_t |
| fpr_rint(fpr x) { |
| /* |
 * We do not want to use llrint() since it might not be
 * constant-time.
| * |
| * Suppose that x >= 0. If x >= 2^52, then it is already an |
| * integer. Otherwise, if x < 2^52, then computing x+2^52 will |
| * yield a value that will be rounded to the nearest integer |
| * with exactly the right rules (round-to-nearest-even). |
| * |
| * In order to have constant-time processing, we must do the |
| * computation for both x >= 0 and x < 0 cases, and use a |
| * cast to an integer to access the sign and select the proper |
| * value. Such casts also allow us to find out if |x| < 2^52. |
| */ |
| int64_t sx, tx, rp, rn, m; |
| uint32_t ub; |
| |
| sx = (int64_t)(x.v - 1.0); |
| tx = (int64_t)x.v; |
| rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496; |
| rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496; |
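
    /*
     * Worked example (illustrative), for x = 2.5:
     *   sx = trunc(1.5) = 1  -> non-negative, so rp gets selected
     *   tx = trunc(2.5) = 2
     *   rp: 2.5 + 2^52 falls on a tie and rounds to 2^52 + 2 (even
     *       mantissa), so rp = 2, the round-to-nearest-even result.
     */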
| |
| /* |
| * If tx >= 2^52 or tx < -2^52, then result is tx. |
| * Otherwise, if sx >= 0, then result is rp. |
| * Otherwise, result is rn. We use the fact that when x is |
| * close to 0 (|x| <= 0.25) then both rp and rn are correct; |
| * and if x is not close to 0, then trunc(x-1.0) yields the |
| * appropriate sign. |
| */ |
| |
| /* |
 * Clamp rp to zero if sx < 0.
 * Clamp rn to zero if sx >= 0.
| */ |
| m = sx >> 63; |
| rn &= m; |
| rp &= ~m; |
| |
| /* |
| * Get the 12 upper bits of tx; if they are not all zeros or |
| * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both |
| * rp and rn to zero. Otherwise, we clamp tx to zero. |
| */ |
| ub = (uint32_t)((uint64_t)tx >> 52); |
| m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31); |
| rp &= m; |
| rn &= m; |
| tx &= ~m; |
| |
| /* |
| * Only one of tx, rn or rp (at most) can be non-zero at this |
| * point. |
| */ |
| return tx | rn | rp; |
| } |
| |
| static inline int64_t |
| fpr_floor(fpr x) { |
| int64_t r; |
| |
| /* |
| * The cast performs a trunc() (rounding toward 0) and thus is |
| * wrong by 1 for most negative values. The correction below is |
| * constant-time as long as the compiler turns the |
| * floating-point conversion result into a 0/1 integer without a |
| * conditional branch or another non-constant-time construction. |
| * This should hold on all modern architectures with an FPU (and |
| * if it is false on a given arch, then chances are that the FPU |
| * itself is not constant-time, making the point moot). |
| */ |
| r = (int64_t)x.v; |
| return r - (x.v < (double)r); |
| } |
| |
| static inline int64_t |
| fpr_trunc(fpr x) { |
| return (int64_t)x.v; |
| } |
| |
| static inline fpr |
| fpr_add(fpr x, fpr y) { |
| return FPR(x.v + y.v); |
| } |
| |
| static inline fpr |
| fpr_sub(fpr x, fpr y) { |
| return FPR(x.v - y.v); |
| } |
| |
| static inline fpr |
| fpr_neg(fpr x) { |
| return FPR(-x.v); |
| } |
| |
| static inline fpr |
| fpr_half(fpr x) { |
| return FPR(x.v * 0.5); |
| } |
| |
| static inline fpr |
| fpr_double(fpr x) { |
| return FPR(x.v + x.v); |
| } |
| |
| static inline fpr |
| fpr_mul(fpr x, fpr y) { |
| return FPR(x.v * y.v); |
| } |
| |
| static inline fpr |
| fpr_sqr(fpr x) { |
| return FPR(x.v * x.v); |
| } |
| |
| static inline fpr |
| fpr_inv(fpr x) { |
| return FPR(1.0 / x.v); |
| } |
| |
| static inline fpr |
| fpr_div(fpr x, fpr y) { |
| return FPR(x.v / y.v); |
| } |
| |
static inline void
fpr_sqrt_avx2(double *t) {
    __m128d x;

    x = _mm_load1_pd(t);   /* broadcast *t into both lanes */
    x = _mm_sqrt_pd(x);    /* correctly rounded SSE2 square root */
    _mm_storel_pd(t, x);   /* write back the low lane */
}
| |
| static inline fpr |
| fpr_sqrt(fpr x) { |
| /* |
| * We prefer not to have a dependency on libm when it can be |
| * avoided. On x86, calling the sqrt() libm function inlines |
| * the relevant opcode (fsqrt or sqrtsd, depending on whether |
| * the 387 FPU or SSE2 is used for floating-point operations) |
| * but then makes an optional call to the library function |
| * for proper error handling, in case the operand is negative. |
| * |
| * To avoid this dependency, we use intrinsics or inline assembly |
| * on recognized platforms: |
| * |
| * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics. |
| * |
| * - On GCC/Clang with SSE maths, we use SSE2 intrinsics. |
| * |
| * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly |
| * to call the 387 FPU fsqrt opcode. |
| * |
| * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call |
| * the fsqrt opcode (Clang needs a special hack). |
| * |
 * - On GCC/Clang on ARM with hardware floating-point, we use
 *   inline assembly to call the vsqrt.f64 opcode. Due to a
 *   complex ecosystem of compilers and assembly syntaxes, we
 *   have to call it "fsqrt" or "fsqrtd", depending on the case.
| * |
 * If the platform is not recognized, a call to the system
 * library function sqrt() is performed. On some compilers, this
 * may actually inline the relevant opcode, and call the library
 * function only when the input is invalid (e.g. negative);
 * Falcon never actually calls sqrt() on a negative value, but
 * the dependency on libm will still be there. In this AVX2
 * build, the SSE2 intrinsic path above is always used.
| */ |
| |
| fpr_sqrt_avx2(&x.v); |
| return x; |
| } |
| |
| static inline int |
| fpr_lt(fpr x, fpr y) { |
| return x.v < y.v; |
| } |
| |
| static inline uint64_t |
| fpr_expm_p63(fpr x, fpr ccs) { |
| /* |
| * Polynomial approximation of exp(-x) is taken from FACCT: |
| * https://eprint.iacr.org/2018/1234 |
| * Specifically, values are extracted from the implementation |
| * referenced from the FACCT article, and available at: |
| * https://github.com/raykzhao/gaussian |
 * Tests over more than 24 billion random inputs in the
| * 0..log(2) range have never shown a deviation larger than |
| * 2^(-50) from the true mathematical value. |
| */ |
| |
| /* |
 * The AVX2 implementation uses more operations than Horner's method,
| * but with a lower expression tree depth. This helps because |
| * additions and multiplications have a latency of 4 cycles on |
| * a Skylake, but the CPU can issue two of them per cycle. |
| */ |
| |
| static const union { |
| double d[12]; |
| __m256d v[3]; |
| } c = { |
| { |
| 0.999999999999994892974086724280, |
| 0.500000000000019206858326015208, |
| 0.166666666666984014666397229121, |
| 0.041666666666110491190622155955, |
| 0.008333333327800835146903501993, |
| 0.001388888894063186997887560103, |
| 0.000198412739277311890541063977, |
| 0.000024801566833585381209939524, |
| 0.000002755586350219122514855659, |
| 0.000000275607356160477811864927, |
| 0.000000025299506379442070029551, |
| 0.000000002073772366009083061987 |
| } |
| }; |
| |
    double d1, d2, d4, d8, y;
    __m256d d14, d58, d9c;

    /*
     * d1 = -x, so the odd powers below carry the alternating signs
     * of the exp(-x) series. d14 holds (d1, d1^2, d1^3, d1^4); the
     * next two vectors hold the following groups of four powers.
     */
    d1 = -x.v;
    d2 = d1 * d1;
    d4 = d2 * d2;
    d8 = d4 * d4;
    d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
    d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));   /* d1^5..d1^8 */
    d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));   /* d1^9..d1^12 */

    /*
     * Multiply each group of powers by its coefficients, accumulate
     * the three partial sums with fused multiply-adds, then reduce
     * the four lanes horizontally and add the implicit leading 1.
     */
    d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
    d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
    d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
    d9c = _mm256_hadd_pd(d9c, d9c);
    y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) /* _mm256_cvtsd_f64(d9c) */
        + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
    y *= ccs.v;
| |
| /* |
| * Final conversion goes through int64_t first, because that's what |
| * the underlying opcode (vcvttsd2si) will do, and we know that the |
| * result will fit, since x >= 0 and ccs < 1. If we did the |
| * conversion directly to uint64_t, then the compiler would add some |
| * extra code to cover the case of a source value of 2^63 or more, |
| * and though the alternate path would never be exercised, the |
| * extra comparison would cost us some cycles. |
| */ |
    return (uint64_t)(int64_t)(y * fpr_ptwo63.v);
}
| |
/*
 * Precomputed tables (defined elsewhere in the library): fpr_gm_tab
 * holds the roots of unity used by the FFT (interleaved real and
 * imaginary parts, in bit-reversed order), and fpr_p2_tab holds
 * powers of two.
 */
#define fpr_gm_tab PQCLEAN_FALCONPADDED512_AVX2_fpr_gm_tab
extern const fpr fpr_gm_tab[];

#define fpr_p2_tab PQCLEAN_FALCONPADDED512_AVX2_fpr_p2_tab
extern const fpr fpr_p2_tab[];
| |
| /* ====================================================================== */ |