| #ifndef FALCON_INNER_H__ |
| #define FALCON_INNER_H__ |
| |
| /* |
| * Internal functions for Falcon. This is not the API intended to be |
| * used by applications; instead, this internal API provides all the |
| * primitives on which wrappers build to provide external APIs. |
| * |
| * ==========================(LICENSE BEGIN)============================ |
| * |
| * Copyright (c) 2017-2019 Falcon Project |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining |
| * a copy of this software and associated documentation files (the |
| * "Software"), to deal in the Software without restriction, including |
| * without limitation the rights to use, copy, modify, merge, publish, |
| * distribute, sublicense, and/or sell copies of the Software, and to |
| * permit persons to whom the Software is furnished to do so, subject to |
| * the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be |
| * included in all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| * |
| * ===========================(LICENSE END)============================= |
| * |
| * @author Thomas Pornin <[email protected]> |
| */ |
| |
| /* |
| * IMPORTANT API RULES |
| * ------------------- |
| * |
| * This API has some non-trivial usage rules: |
| * |
| * |
| * - All public functions (i.e. the non-static ones) must be referenced |
| * with the PQCLEAN_FALCONPADDED512_AVX2_ macro (e.g. PQCLEAN_FALCONPADDED512_AVX2_verify_raw for the verify_raw() |
| * function). That macro adds a prefix to the name, which is |
| * configurable with the FALCON_PREFIX macro. This allows compiling |
| * the code into a specific "namespace" and potentially including |
| * several versions of this code into a single application (e.g. to |
| * have an AVX2 and a non-AVX2 variants and select the one to use at |
| * runtime based on availability of AVX2 opcodes). |
| * |
| * - Functions that need temporary buffers expects them as a final |
| * tmp[] array of type uint8_t*, with a size which is documented for |
| * each function. However, most have some alignment requirements, |
| * because they will use the array to store 16-bit, 32-bit or 64-bit |
| * values (e.g. uint64_t or double). The caller must ensure proper |
| * alignment. What happens on unaligned access depends on the |
| * underlying architecture, ranging from a slight time penalty |
| * to immediate termination of the process. |
| * |
| * - Some functions rely on specific rounding rules and precision for |
| * floating-point numbers. On some systems (in particular 32-bit x86 |
| * with the 387 FPU), this requires setting an hardware control |
| * word. The caller MUST use set_fpu_cw() to ensure proper precision: |
| * |
| * oldcw = set_fpu_cw(2); |
| * PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(...); |
| * set_fpu_cw(oldcw); |
| * |
| * On systems where the native floating-point precision is already |
| * proper, or integer-based emulation is used, the set_fpu_cw() |
| * function does nothing, so it can be called systematically. |
| */ |
| |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| /* |
| * This implementation uses AVX2 and optionally FMA intrinsics. |
| */ |
| #include <immintrin.h> |
| #define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c) |
| #define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c) |
| |
| /* |
| * Some computations with floating-point elements, in particular |
| * rounding to the nearest integer, rely on operations using _exactly_ |
| * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit |
| * x86, the 387 FPU may be used (depending on the target OS) and, in |
| * that case, may use more precision bits (i.e. 64 bits, for an 80-bit |
| * total type length); to prevent miscomputations, we define an explicit |
| * function that modifies the precision in the FPU control word. |
| * |
| * set_fpu_cw() sets the precision to the provided value, and returns |
| * the previously set precision; callers are supposed to restore the |
| * previous precision on exit. The correct (52-bit) precision is |
| * configured with the value "2". On unsupported compilers, or on |
| * targets other than 32-bit x86, or when the native 'double' type is |
| * not used, the set_fpu_cw() function does nothing at all. |
| */ |
| static inline unsigned |
| set_fpu_cw(unsigned x) { |
| return x; |
| } |
| |
| /* ==================================================================== */ |
| /* |
| * SHAKE256 implementation (shake.c). |
| * |
| * API is defined to be easily replaced with the fips202.h API defined |
| * as part of PQClean. |
| */ |
| |
| #include "fips202.h" |
| |
| #define inner_shake256_context shake256incctx |
| #define inner_shake256_init(sc) shake256_inc_init(sc) |
| #define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) |
| #define inner_shake256_flip(sc) shake256_inc_finalize(sc) |
| #define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) |
| #define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc) |
| |
| /* ==================================================================== */ |
| /* |
| * Encoding/decoding functions (codec.c). |
| * |
| * Encoding functions take as parameters an output buffer (out) with |
| * a given maximum length (max_out_len); returned value is the actual |
| * number of bytes which have been written. If the output buffer is |
| * not large enough, then 0 is returned (some bytes may have been |
| * written to the buffer). If 'out' is NULL, then 'max_out_len' is |
| * ignored; instead, the function computes and returns the actual |
| * required output length (in bytes). |
| * |
| * Decoding functions take as parameters an input buffer (in) with |
| * its maximum length (max_in_len); returned value is the actual number |
| * of bytes that have been read from the buffer. If the provided length |
| * is too short, then 0 is returned. |
| * |
| * Values to encode or decode are vectors of integers, with N = 2^logn |
| * elements. |
| * |
| * Three encoding formats are defined: |
| * |
| * - modq: sequence of values modulo 12289, each encoded over exactly |
| * 14 bits. The encoder and decoder verify that integers are within |
| * the valid range (0..12288). Values are arrays of uint16. |
| * |
| * - trim: sequence of signed integers, a specified number of bits |
| * each. The number of bits is provided as parameter and includes |
| * the sign bit. Each integer x must be such that |x| < 2^(bits-1) |
| * (which means that the -2^(bits-1) value is forbidden); encode and |
| * decode functions check that property. Values are arrays of |
| * int16_t or int8_t, corresponding to names 'trim_i16' and |
| * 'trim_i8', respectively. |
| * |
| * - comp: variable-length encoding for signed integers; each integer |
| * uses a minimum of 9 bits, possibly more. This is normally used |
| * only for signatures. |
| * |
| */ |
| |
| size_t PQCLEAN_FALCONPADDED512_AVX2_modq_encode(void *out, size_t max_out_len, |
| const uint16_t *x, unsigned logn); |
| size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i16_encode(void *out, size_t max_out_len, |
| const int16_t *x, unsigned logn, unsigned bits); |
| size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(void *out, size_t max_out_len, |
| const int8_t *x, unsigned logn, unsigned bits); |
| size_t PQCLEAN_FALCONPADDED512_AVX2_comp_encode(void *out, size_t max_out_len, |
| const int16_t *x, unsigned logn); |
| |
| size_t PQCLEAN_FALCONPADDED512_AVX2_modq_decode(uint16_t *x, unsigned logn, |
| const void *in, size_t max_in_len); |
| size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits, |
| const void *in, size_t max_in_len); |
| size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits, |
| const void *in, size_t max_in_len); |
| size_t PQCLEAN_FALCONPADDED512_AVX2_comp_decode(int16_t *x, unsigned logn, |
| const void *in, size_t max_in_len); |
| |
| /* |
| * Number of bits for key elements, indexed by logn (1 to 10). This |
| * is at most 8 bits for all degrees, but some degrees may have shorter |
| * elements. |
| */ |
| extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[]; |
| extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[]; |
| |
| /* |
| * Maximum size, in bits, of elements in a signature, indexed by logn |
| * (1 to 10). The size includes the sign bit. |
| */ |
| extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_sig_bits[]; |
| |
| /* ==================================================================== */ |
| /* |
| * Support functions used for both signature generation and signature |
| * verification (common.c). |
| */ |
| |
| /* |
| * From a SHAKE256 context (must be already flipped), produce a new |
| * point. This is the non-constant-time version, which may leak enough |
| * information to serve as a stop condition on a brute force attack on |
| * the hashed message (provided that the nonce value is known). |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(inner_shake256_context *sc, |
| uint16_t *x, unsigned logn); |
| |
| /* |
| * From a SHAKE256 context (must be already flipped), produce a new |
| * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. |
| * This function is constant-time but is typically more expensive than |
| * PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(). |
| * |
| * tmp[] must have 16-bit alignment. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(inner_shake256_context *sc, |
| uint16_t *x, unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Tell whether a given vector (2N coordinates, in two halves) is |
| * acceptable as a signature. This compares the appropriate norm of the |
| * vector with the acceptance bound. Returned value is 1 on success |
| * (vector is short enough to be acceptable), 0 otherwise. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_is_short(const int16_t *s1, const int16_t *s2, unsigned logn); |
| |
| /* |
| * Tell whether a given vector (2N coordinates, in two halves) is |
| * acceptable as a signature. Instead of the first half s1, this |
| * function receives the "saturated squared norm" of s1, i.e. the |
| * sum of the squares of the coordinates of s1 (saturated at 2^32-1 |
| * if the sum exceeds 2^31-1). |
| * |
| * Returned value is 1 on success (vector is short enough to be |
| * acceptable), 0 otherwise. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn); |
| |
| /* ==================================================================== */ |
| /* |
| * Signature verification functions (vrfy.c). |
| */ |
| |
| /* |
| * Convert a public key to NTT + Montgomery format. Conversion is done |
| * in place. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(uint16_t *h, unsigned logn); |
| |
| /* |
| * Internal signature verification code: |
| * c0[] contains the hashed nonce+message |
| * s2[] is the decoded signature |
| * h[] contains the public key, in NTT + Montgomery format |
| * logn is the degree log |
| * tmp[] temporary, must have at least 2*2^logn bytes |
| * Returned value is 1 on success, 0 on error. |
| * |
| * tmp[] must have 16-bit alignment. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2, |
| const uint16_t *h, unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Compute the public key h[], given the private key elements f[] and |
| * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial |
| * modulus. This function returns 1 on success, 0 on error (an error is |
| * reported if f is not invertible mod phi mod q). |
| * |
| * The tmp[] array must have room for at least 2*2^logn elements. |
| * tmp[] must have 16-bit alignment. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_compute_public(uint16_t *h, |
| const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Recompute the fourth private key element. Private key consists in |
| * four polynomials with small coefficients f, g, F and G, which are |
| * such that fG - gF = q mod phi; furthermore, f is invertible modulo |
| * phi and modulo q. This function recomputes G from f, g and F. |
| * |
| * The tmp[] array must have room for at least 4*2^logn bytes. |
| * |
| * Returned value is 1 in success, 0 on error (f not invertible). |
| * tmp[] must have 16-bit alignment. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_complete_private(int8_t *G, |
| const int8_t *f, const int8_t *g, const int8_t *F, |
| unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Test whether a given polynomial is invertible modulo phi and q. |
| * Polynomial coefficients are small integers. |
| * |
| * tmp[] must have 16-bit alignment. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_is_invertible( |
| const int16_t *s2, unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Count the number of elements of value zero in the NTT representation |
| * of the given polynomial: this is the number of primitive 2n-th roots |
| * of unity (modulo q = 12289) that are roots of the provided polynomial |
| * (taken modulo q). |
| * |
| * tmp[] must have 16-bit alignment. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Internal signature verification with public key recovery: |
| * h[] receives the public key (NOT in NTT/Montgomery format) |
| * c0[] contains the hashed nonce+message |
| * s1[] is the first signature half |
| * s2[] is the second signature half |
| * logn is the degree log |
| * tmp[] temporary, must have at least 2*2^logn bytes |
| * Returned value is 1 on success, 0 on error. Success is returned if |
| * the signature is a short enough vector; in that case, the public |
| * key has been written to h[]. However, the caller must still |
| * verify that h[] is the correct value (e.g. with regards to a known |
| * hash of the public key). |
| * |
| * h[] may not overlap with any of the other arrays. |
| * |
| * tmp[] must have 16-bit alignment. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_verify_recover(uint16_t *h, |
| const uint16_t *c0, const int16_t *s1, const int16_t *s2, |
| unsigned logn, uint8_t *tmp); |
| |
| /* ==================================================================== */ |
| /* |
| * Implementation of floating-point real numbers (fpr.h, fpr.c). |
| */ |
| |
| /* |
| * Real numbers are implemented by an extra header file, included below. |
| * This is meant to support pluggable implementations. The default |
| * implementation relies on the C type 'double'. |
| * |
| * The included file must define the following types, functions and |
| * constants: |
| * |
| * fpr |
| * type for a real number |
| * |
| * fpr fpr_of(int64_t i) |
| * cast an integer into a real number; source must be in the |
| * -(2^63-1)..+(2^63-1) range |
| * |
| * fpr fpr_scaled(int64_t i, int sc) |
| * compute i*2^sc as a real number; source 'i' must be in the |
| * -(2^63-1)..+(2^63-1) range |
| * |
| * fpr fpr_ldexp(fpr x, int e) |
| * compute x*2^e |
| * |
| * int64_t fpr_rint(fpr x) |
| * round x to the nearest integer; x must be in the -(2^63-1) |
| * to +(2^63-1) range |
| * |
| * int64_t fpr_trunc(fpr x) |
| * round to an integer; this rounds towards zero; value must |
| * be in the -(2^63-1) to +(2^63-1) range |
| * |
| * fpr fpr_add(fpr x, fpr y) |
| * compute x + y |
| * |
| * fpr fpr_sub(fpr x, fpr y) |
| * compute x - y |
| * |
| * fpr fpr_neg(fpr x) |
| * compute -x |
| * |
| * fpr fpr_half(fpr x) |
| * compute x/2 |
| * |
| * fpr fpr_double(fpr x) |
| * compute x*2 |
| * |
| * fpr fpr_mul(fpr x, fpr y) |
| * compute x * y |
| * |
| * fpr fpr_sqr(fpr x) |
| * compute x * x |
| * |
| * fpr fpr_inv(fpr x) |
| * compute 1/x |
| * |
| * fpr fpr_div(fpr x, fpr y) |
| * compute x/y |
| * |
| * fpr fpr_sqrt(fpr x) |
| * compute the square root of x |
| * |
| * int fpr_lt(fpr x, fpr y) |
| * return 1 if x < y, 0 otherwise |
| * |
| * uint64_t fpr_expm_p63(fpr x) |
| * return exp(x), assuming that 0 <= x < log(2). Returned value |
| * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x), |
| * rounded to the nearest integer). Computation should have a |
| * precision of at least 45 bits. |
| * |
| * const fpr fpr_gm_tab[] |
| * array of constants for FFT / iFFT |
| * |
| * const fpr fpr_p2_tab[] |
| * precomputed powers of 2 (by index, 0 to 10) |
| * |
| * Constants of type 'fpr': |
| * |
| * fpr fpr_q 12289 |
| * fpr fpr_inverse_of_q 1/12289 |
| * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2)) |
| * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10) |
| * fpr fpr_sigma_min[] 1/sigma_min (indexed by logn, 1 to 10) |
| * fpr fpr_log2 log(2) |
| * fpr fpr_inv_log2 1/log(2) |
| * fpr fpr_bnorm_max 16822.4121 |
| * fpr fpr_zero 0 |
| * fpr fpr_one 1 |
| * fpr fpr_two 2 |
| * fpr fpr_onehalf 0.5 |
| * fpr fpr_ptwo31 2^31 |
| * fpr fpr_ptwo31m1 2^31-1 |
| * fpr fpr_mtwo31m1 -(2^31-1) |
| * fpr fpr_ptwo63m1 2^63-1 |
| * fpr fpr_mtwo63m1 -(2^63-1) |
| * fpr fpr_ptwo63 2^63 |
| */ |
| #include "fpr.h" |
| |
| /* ==================================================================== */ |
| /* |
| * RNG (rng.c). |
| * |
| * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256 |
| * context (flipped) and is used for bulk pseudorandom generation. |
| * A system-dependent seed generator is also provided. |
| */ |
| |
| /* |
| * Obtain a random seed from the system RNG. |
| * |
| * Returned value is 1 on success, 0 on error. |
| */ |
| int PQCLEAN_FALCONPADDED512_AVX2_get_seed(void *seed, size_t seed_len); |
| |
| /* |
| * Structure for a PRNG. This includes a large buffer so that values |
| * get generated in advance. The 'state' is used to keep the current |
| * PRNG algorithm state (contents depend on the selected algorithm). |
| * |
| * The unions with 'dummy_u64' are there to ensure proper alignment for |
| * 64-bit direct access. |
| */ |
| typedef struct { |
| union { |
| uint8_t d[512]; /* MUST be 512, exactly */ |
| uint64_t dummy_u64; |
| } buf; |
| size_t ptr; |
| union { |
| uint8_t d[256]; |
| uint64_t dummy_u64; |
| } state; |
| int type; |
| } prng; |
| |
| /* |
| * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 |
| * context (in "flipped" state) to obtain its initial state. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_prng_init(prng *p, inner_shake256_context *src); |
| |
| /* |
| * Refill the PRNG buffer. This is normally invoked automatically, and |
| * is declared here only so that prng_get_u64() may be inlined. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_prng_refill(prng *p); |
| |
| /* |
| * Get some bytes from a PRNG. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len); |
| |
| /* |
| * Get a 64-bit random value from a PRNG. |
| */ |
| static inline uint64_t |
| prng_get_u64(prng *p) { |
| size_t u; |
| |
| /* |
| * If there are less than 9 bytes in the buffer, we refill it. |
| * This means that we may drop the last few bytes, but this allows |
| * for faster extraction code. Also, it means that we never leave |
| * an empty buffer. |
| */ |
| u = p->ptr; |
| if (u >= (sizeof p->buf.d) - 9) { |
| PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p); |
| u = 0; |
| } |
| p->ptr = u + 8; |
| |
| return (uint64_t)p->buf.d[u + 0] |
| | ((uint64_t)p->buf.d[u + 1] << 8) |
| | ((uint64_t)p->buf.d[u + 2] << 16) |
| | ((uint64_t)p->buf.d[u + 3] << 24) |
| | ((uint64_t)p->buf.d[u + 4] << 32) |
| | ((uint64_t)p->buf.d[u + 5] << 40) |
| | ((uint64_t)p->buf.d[u + 6] << 48) |
| | ((uint64_t)p->buf.d[u + 7] << 56); |
| } |
| |
| /* |
| * Get an 8-bit random value from a PRNG. |
| */ |
| static inline unsigned |
| prng_get_u8(prng *p) { |
| unsigned v; |
| |
| v = p->buf.d[p->ptr ++]; |
| if (p->ptr == sizeof p->buf.d) { |
| PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p); |
| } |
| return v; |
| } |
| |
| /* ==================================================================== */ |
| /* |
| * FFT (falcon-fft.c). |
| * |
| * A real polynomial is represented as an array of N 'fpr' elements. |
| * The FFT representation of a real polynomial contains N/2 complex |
| * elements; each is stored as two real numbers, for the real and |
| * imaginary parts, respectively. See falcon-fft.c for details on the |
| * internal representation. |
| */ |
| |
| /* |
| * Compute FFT in-place: the source array should contain a real |
| * polynomial (N coefficients); its storage area is reused to store |
| * the FFT representation of that polynomial (N/2 complex numbers). |
| * |
| * 'logn' MUST lie between 1 and 10 (inclusive). |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_FFT(fpr *f, unsigned logn); |
| |
| /* |
| * Compute the inverse FFT in-place: the source array should contain the |
| * FFT representation of a real polynomial (N/2 elements); the resulting |
| * real polynomial (N coefficients of type 'fpr') is written over the |
| * array. |
| * |
| * 'logn' MUST lie between 1 and 10 (inclusive). |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_iFFT(fpr *f, unsigned logn); |
| |
| /* |
| * Add polynomial b to polynomial a. a and b MUST NOT overlap. This |
| * function works in both normal and FFT representations. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_add(fpr *a, const fpr *b, unsigned logn); |
| |
| /* |
| * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This |
| * function works in both normal and FFT representations. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_sub(fpr *a, const fpr *b, unsigned logn); |
| |
| /* |
| * Negate polynomial a. This function works in both normal and FFT |
| * representations. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_neg(fpr *a, unsigned logn); |
| |
| /* |
| * Compute adjoint of polynomial a. This function works only in FFT |
| * representation. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(fpr *a, unsigned logn); |
| |
| /* |
| * Multiply polynomial a with polynomial b. a and b MUST NOT overlap. |
| * This function works only in FFT representation. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(fpr *a, const fpr *b, unsigned logn); |
| |
| /* |
| * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT |
| * overlap. This function works only in FFT representation. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn); |
| |
| /* |
| * Multiply polynomial with its own adjoint. This function works only in FFT |
| * representation. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn); |
| |
| /* |
| * Multiply polynomial with a real constant. This function works in both |
| * normal and FFT representations. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn); |
| |
| /* |
| * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation). |
| * a and b MUST NOT overlap. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_div_fft(fpr *a, const fpr *b, unsigned logn); |
| |
| /* |
| * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g)) |
| * (also in FFT representation). Since the result is auto-adjoint, all its |
| * coordinates in FFT representation are real; as such, only the first N/2 |
| * values of d[] are filled (the imaginary parts are skipped). |
| * |
| * Array d MUST NOT overlap with either a or b. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(fpr *d, |
| const fpr *a, const fpr *b, unsigned logn); |
| |
| /* |
| * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g) |
| * (also in FFT representation). Destination d MUST NOT overlap with |
| * any of the source arrays. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_add_muladj_fft(fpr *d, |
| const fpr *F, const fpr *G, |
| const fpr *f, const fpr *g, unsigned logn); |
| |
| /* |
| * Multiply polynomial a by polynomial b, where b is autoadjoint. Both |
| * a and b are in FFT representation. Since b is autoadjoint, all its |
| * FFT coefficients are real, and the array b contains only N/2 elements. |
| * a and b MUST NOT overlap. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(fpr *a, |
| const fpr *b, unsigned logn); |
| |
| /* |
| * Divide polynomial a by polynomial b, where b is autoadjoint. Both |
| * a and b are in FFT representation. Since b is autoadjoint, all its |
| * FFT coefficients are real, and the array b contains only N/2 elements. |
| * a and b MUST NOT overlap. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_div_autoadj_fft(fpr *a, |
| const fpr *b, unsigned logn); |
| |
| /* |
| * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT |
| * representation. On input, g00, g01 and g11 are provided (where the |
| * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10 |
| * and d11 values are written in g00, g01 and g11, respectively |
| * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]). |
| * (In fact, d00 = g00, so the g00 operand is left unmodified.) |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_LDL_fft(const fpr *g00, |
| fpr *g01, fpr *g11, unsigned logn); |
| |
| /* |
| * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT |
| * representation. This is identical to poly_LDL_fft() except that |
| * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written |
| * in two other separate buffers provided as extra parameters. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(fpr *d11, fpr *l10, |
| const fpr *g00, const fpr *g01, |
| const fpr *g11, unsigned logn); |
| |
| /* |
| * Apply "split" operation on a polynomial in FFT representation: |
| * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1 |
| * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(fpr *f0, fpr *f1, |
| const fpr *f, unsigned logn); |
| |
| /* |
| * Apply "merge" operation on two polynomials in FFT representation: |
| * given f0 and f1, polynomials moduo X^(N/2)+1, this function computes |
| * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1. |
| * f MUST NOT overlap with either f0 or f1. |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(fpr *f, |
| const fpr *f0, const fpr *f1, unsigned logn); |
| |
| /* ==================================================================== */ |
| /* |
| * Key pair generation. |
| */ |
| |
| /* |
| * Required sizes of the temporary buffer (in bytes). |
| * |
| * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 |
| * or 2) where it is slightly greater. |
| */ |
| #define FALCON_KEYGEN_TEMP_1 136 |
| #define FALCON_KEYGEN_TEMP_2 272 |
| #define FALCON_KEYGEN_TEMP_3 224 |
| #define FALCON_KEYGEN_TEMP_4 448 |
| #define FALCON_KEYGEN_TEMP_5 896 |
| #define FALCON_KEYGEN_TEMP_6 1792 |
| #define FALCON_KEYGEN_TEMP_7 3584 |
| #define FALCON_KEYGEN_TEMP_8 7168 |
| #define FALCON_KEYGEN_TEMP_9 14336 |
| #define FALCON_KEYGEN_TEMP_10 28672 |
| |
| /* |
| * Generate a new key pair. Randomness is extracted from the provided |
| * SHAKE256 context, which must have already been seeded and flipped. |
| * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_* |
| * macros) and be aligned for the uint32_t, uint64_t and fpr types. |
| * |
| * The private key elements are written in f, g, F and G, and the |
| * public key is written in h. Either or both of G and h may be NULL, |
| * in which case the corresponding element is not returned (they can |
| * be recomputed from f, g and F). |
| * |
| * tmp[] must have 64-bit alignment. |
| * This function uses floating-point rounding (see set_fpu_cw()). |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_keygen(inner_shake256_context *rng, |
| int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, |
| unsigned logn, uint8_t *tmp); |
| |
| /* ==================================================================== */ |
| /* |
| * Signature generation. |
| */ |
| |
| /* |
| * Expand a private key into the B0 matrix in FFT representation and |
| * the LDL tree. All the values are written in 'expanded_key', for |
| * a total of (8*logn+40)*2^logn bytes. |
| * |
| * The tmp[] array must have room for at least 48*2^logn bytes. |
| * |
| * tmp[] must have 64-bit alignment. |
| * This function uses floating-point rounding (see set_fpu_cw()). |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_expand_privkey(fpr *expanded_key, |
| const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, |
| unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Compute a signature over the provided hashed message (hm); the |
| * signature value is one short vector. This function uses an |
| * expanded key (as generated by PQCLEAN_FALCONPADDED512_AVX2_expand_privkey()). |
| * |
| * The sig[] and hm[] buffers may overlap. |
| * |
| * On successful output, the start of the tmp[] buffer contains the s1 |
| * vector (as int16_t elements). |
| * |
| * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. |
| * |
| * tmp[] must have 64-bit alignment. |
| * This function uses floating-point rounding (see set_fpu_cw()). |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng, |
| const fpr *expanded_key, |
| const uint16_t *hm, unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Compute a signature over the provided hashed message (hm); the |
| * signature value is one short vector. This function uses a raw |
| * key and dynamically recompute the B0 matrix and LDL tree; this |
| * saves RAM since there is no needed for an expanded key, but |
| * increases the signature cost. |
| * |
| * The sig[] and hm[] buffers may overlap. |
| * |
| * On successful output, the start of the tmp[] buffer contains the s1 |
| * vector (as int16_t elements). |
| * |
| * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. |
| * |
| * tmp[] must have 64-bit alignment. |
| * This function uses floating-point rounding (see set_fpu_cw()). |
| */ |
| void PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng, |
| const int8_t *f, const int8_t *g, |
| const int8_t *F, const int8_t *G, |
| const uint16_t *hm, unsigned logn, uint8_t *tmp); |
| |
| /* |
| * Internal sampler engine. Exported for tests. |
| * |
| * sampler_context wraps around a source of random numbers (PRNG) and |
| * the sigma_min value (nominally dependent on the degree). |
| * |
| * sampler() takes as parameters: |
| * ctx pointer to the sampler_context structure |
| * mu center for the distribution |
| * isigma inverse of the distribution standard deviation |
| * It returns an integer sampled along the Gaussian distribution centered |
| * on mu and of standard deviation sigma = 1/isigma. |
| * |
| * gaussian0_sampler() takes as parameter a pointer to a PRNG, and |
| * returns an integer sampled along a half-Gaussian with standard |
| * deviation sigma0 = 1.8205 (center is 0, returned value is |
| * nonnegative). |
| */ |
| |
| typedef struct { |
| prng p; |
| fpr sigma_min; |
| } sampler_context; |
| |
| int PQCLEAN_FALCONPADDED512_AVX2_sampler(void *ctx, fpr mu, fpr isigma); |
| |
| int PQCLEAN_FALCONPADDED512_AVX2_gaussian0_sampler(prng *p); |
| |
| /* ==================================================================== */ |
| |
| #endif |