| /* |
| * Falcon signature generation. |
| * |
| * ==========================(LICENSE BEGIN)============================ |
| * |
| * Copyright (c) 2017-2019 Falcon Project |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining |
| * a copy of this software and associated documentation files (the |
| * "Software"), to deal in the Software without restriction, including |
| * without limitation the rights to use, copy, modify, merge, publish, |
| * distribute, sublicense, and/or sell copies of the Software, and to |
| * permit persons to whom the Software is furnished to do so, subject to |
| * the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be |
| * included in all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| * |
| * ===========================(LICENSE END)============================= |
| * |
| * @author Thomas Pornin <[email protected]> |
| */ |
| |
| #include "inner.h" |
| #include "macrof.h" |
| #include "macrofx4.h" |
| #include "util.h" |
| #include <arm_neon.h> |
| /* =================================================================== */ |
| |
| /* |
| * Compute degree N from logarithm 'logn'. |
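 * For the Falcon-512 parameter set compiled here, logn = 9 and N = 512.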
| */ |
| #define MKN(logn) ((size_t)1 << (logn)) |
| |
| /* =================================================================== */ |
| /* |
| * Binary case: |
| * N = 2^logn |
| * phi = X^N+1 |
| */ |
| |
| /* |
| * Get the size of the LDL tree for an input with polynomials of size |
| * 2^logn. The size is expressed in the number of elements. |
| */ |
| static inline unsigned |
| ffLDL_treesize(unsigned logn) { |
| /* |
| * For logn = 0 (polynomials are constant), the "tree" is a |
| * single element. Otherwise, the tree node has size 2^logn, and |
     * has two child trees of size logn-1 each. Thus, treesize s()
| * must fulfill these two relations: |
| * |
| * s(0) = 1 |
| * s(logn) = (2^logn) + 2*s(logn-1) |
| */ |
| return (logn + 1) << logn; |
| } |
| |
| /* |
| * Inner function for ffLDL_fft(). It expects the matrix to be both |
| * auto-adjoint and quasicyclic; also, it uses the source operands |
| * as modifiable temporaries. |
| * |
| * tmp[] must have room for at least one polynomial. |
| */ |
| static void |
| ffLDL_fft_inner(fpr *restrict tree, |
| fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) { |
| size_t n, hn; |
| |
| n = MKN(logn); |
| if (n == 1) { |
| tree[0] = g0[0]; |
| return; |
| } |
| hn = n >> 1; |
| |
| /* |
| * The LDL decomposition yields L (which is written in the tree) |
| * and the diagonal of D. Since d00 = g0, we just write d11 |
| * into tmp. |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn); |
| |
| /* |
| * Split d00 (currently in g0) and d11 (currently in tmp). We |
| * reuse g0 and g1 as temporary storage spaces: |
| * d00 splits into g1, g1+hn |
| * d11 splits into g0, g0+hn |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(g1, g1 + hn, g0, logn); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(g0, g0 + hn, tmp, logn); |
| |
| /* |
| * Each split result is the first row of a new auto-adjoint |
| * quasicyclic matrix for the next recursive step. |
| */ |
| ffLDL_fft_inner(tree + n, |
| g1, g1 + hn, logn - 1, tmp); |
| ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), |
| g0, g0 + hn, logn - 1, tmp); |
| } |
| |
| /* |
| * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix |
| * is provided as three polynomials (FFT representation). |
| * |
| * The "tree" array is filled with the computed tree, of size |
| * (logn+1)*(2^logn) elements (see ffLDL_treesize()). |
| * |
| * Input arrays MUST NOT overlap, except possibly the three unmodified |
| * arrays g00, g01 and g11. tmp[] should have room for at least three |
| * polynomials of 2^logn elements each. |
| */ |
| static void |
| ffLDL_fft(fpr *restrict tree, const fpr *restrict g00, |
| const fpr *restrict g01, const fpr *restrict g11, |
| unsigned logn, fpr *restrict tmp) { |
| size_t n, hn; |
| fpr *d00, *d11; |
| |
| n = MKN(logn); |
| if (n == 1) { |
| tree[0] = g00[0]; |
| return; |
| } |
| hn = n >> 1; |
| d00 = tmp; |
| d11 = tmp + n; |
| tmp += n << 1; |
| |
| memcpy(d00, g00, n * sizeof * g00); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, d00, logn); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(d00, d00 + hn, d11, logn); |
| memcpy(d11, tmp, n * sizeof * tmp); |
| |
| ffLDL_fft_inner(tree + n, d11, d11 + hn, logn - 1, tmp); |
    ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), d00, d00 + hn, logn - 1, tmp);
}
| |
| /* |
| * Normalize an ffLDL tree: each leaf of value x is replaced with |
| * sigma / sqrt(x). |
| */ |
| static void |
| ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) { |
| /* |
| * TODO: make an iterative version. |
| */ |
| size_t n; |
| |
| n = MKN(logn); |
| if (n == 1) { |
| /* |
| * We actually store in the tree leaf the inverse of |
| * the value mandated by the specification: this |
| * saves a division both here and in the sampler. |
| */ |
| tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma_9); |
| } else { |
| ffLDL_binary_normalize(tree + n, orig_logn, logn - 1); |
| ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1), |
| orig_logn, logn - 1); |
| } |
| } |
| |
| /* =================================================================== */ |
| |
| /* |
| * The expanded private key contains: |
| * - The B0 matrix (four elements) |
| * - The ffLDL tree |
| */ |
| |
| static inline size_t |
| skoff_b00(unsigned logn) { |
| (void)logn; |
| return 0; |
| } |
| |
| static inline size_t |
| skoff_b01(unsigned logn) { |
| return MKN(logn); |
| } |
| |
| static inline size_t |
| skoff_b10(unsigned logn) { |
| return 2 * MKN(logn); |
| } |
| |
| static inline size_t |
| skoff_b11(unsigned logn) { |
| return 3 * MKN(logn); |
| } |
| |
| static inline size_t |
| skoff_tree(unsigned logn) { |
| return 4 * MKN(logn); |
| } |
| |
| /* see inner.h */ |
| void |
| PQCLEAN_FALCONPADDED512_AARCH64_expand_privkey(fpr *restrict expanded_key, |
| const int8_t *f, const int8_t *g, |
| const int8_t *F, const int8_t *G, |
| uint8_t *restrict tmp) { |
| fpr *rf, *rg, *rF, *rG; |
| fpr *b00, *b01, *b10, *b11; |
| fpr *g00, *g01, *g11, *gxx; |
| fpr *tree; |
| |
| b00 = expanded_key + skoff_b00(FALCON_LOGN); |
| b01 = expanded_key + skoff_b01(FALCON_LOGN); |
| b10 = expanded_key + skoff_b10(FALCON_LOGN); |
| b11 = expanded_key + skoff_b11(FALCON_LOGN); |
| tree = expanded_key + skoff_tree(FALCON_LOGN); |
| |
| /* |
| * We load the private key elements directly into the B0 matrix, |
| * since B0 = [[g, -f], [G, -F]]. |
| */ |
| rg = b00; |
| rf = b01; |
| rG = b10; |
| rF = b11; |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rg, g, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(rg, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rf, f, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(rf, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(rf, rf, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rG, G, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(rG, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rF, F, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(rF, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(rF, rF, FALCON_LOGN); |
| |
| /* |
| * Compute the FFT for the key elements, and negate f and F. |
| */ |
| |
| /* |
| * The Gram matrix is G = B·B*. Formulas are: |
| * g00 = b00*adj(b00) + b01*adj(b01) |
| * g01 = b00*adj(b10) + b01*adj(b11) |
| * g10 = b10*adj(b00) + b11*adj(b01) |
| * g11 = b10*adj(b10) + b11*adj(b11) |
| * |
     * For historical reasons, this implementation uses
     * g00, g01 and g11 (upper triangle). g10 is not kept
     * since it is equal to adj(g01).
     */
| g00 = (fpr *)tmp; |
| g01 = g00 + FALCON_N; |
| g11 = g01 + FALCON_N; |
| gxx = g11 + FALCON_N; |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(g00, b00, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(g00, g00, b01, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(g01, b00, b10, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(g01, g01, b01, b11, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(g11, b10, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(g11, g11, b11, FALCON_LOGN); |
| |
| /* |
| * Compute the Falcon tree. |
| */ |
| ffLDL_fft(tree, g00, g01, g11, FALCON_LOGN, gxx); |
| |
| /* |
| * Normalize tree. |
| */ |
| ffLDL_binary_normalize(tree, FALCON_LOGN, FALCON_LOGN); |
| } |
| |
| typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma); |
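/*
 * A samplerZ function returns a random integer that follows a discrete
 * Gaussian distribution of center mu, using the provided context (PRNG
 * state and related parameters). Note that the third parameter actually
 * receives the inverse of the standard deviation: the LDL tree leaves
 * store inverses (see ffLDL_binary_normalize()).
 */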
| |
| /* |
| * Perform Fast Fourier Sampling for target vector t. The Gram matrix |
| * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector |
| * is written over (t0,t1). The Gram matrix is modified as well. The |
| * tmp[] buffer must have room for four polynomials. |
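 * (The first holds l10, the second the split halves of t1; the last
 * two serve the recursive calls and also receive the merged z1.)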
| */ |
| static void |
| ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx, |
| fpr *restrict t0, fpr *restrict t1, |
| fpr *restrict g00, fpr *restrict g01, fpr *restrict g11, |
| unsigned orig_logn, unsigned logn, fpr *restrict tmp) { |
| size_t n, hn; |
| fpr *z0, *z1; |
| |
| /* |
| * Deepest level: the LDL tree leaf value is just g00 (the |
| * array has length only 1 at this point); we normalize it |
     * with regard to sigma, then use it for sampling.
| */ |
| if (logn == 0) { |
| fpr leaf; |
| |
| leaf = g00[0]; |
| leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma_9); |
| t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf)); |
| t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf)); |
| return; |
| } |
| |
| n = (size_t)1 << logn; |
| hn = n >> 1; |
| |
| /* |
| * Decompose G into LDL. We only need d00 (identical to g00), |
| * d11, and l10; we do that in place. |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft(g00, g01, g11, logn); |
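    /*
     * After this call (see poly_LDL_fft), g00 holds d00 = g00, g01
     * holds l10 = adj(g01)/g00, and g11 holds
     * d11 = g11 - g01*adj(g01)/g00.
     */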
| |
| /* |
| * Split d00 and d11 and expand them into half-size quasi-cyclic |
| * Gram matrices. We also save l10 in tmp[]. |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, g00, logn); |
| memcpy(g00, tmp, n * sizeof * tmp); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, g11, logn); |
| memcpy(g11, tmp, n * sizeof * tmp); |
| memcpy(tmp, g01, n * sizeof * g01); |
| memcpy(g01, g00, hn * sizeof * g00); |
| memcpy(g01 + hn, g11, hn * sizeof * g00); |
| |
| /* |
| * The half-size Gram matrices for the recursive LDL tree |
| * building are now: |
| * - left sub-tree: g00, g00+hn, g01 |
| * - right sub-tree: g11, g11+hn, g01+hn |
| * l10 is in tmp[]. |
| */ |
| |
| /* |
| * We split t1 and use the first recursive call on the two |
| * halves, using the right sub-tree. The result is merged |
| * back into tmp + 2*n. |
| */ |
| z1 = tmp + n; |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn); |
| ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn, |
| g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn); |
| |
| /* |
| * Compute tb0 = t0 + (t1 - z1) * l10. |
| * At that point, l10 is in tmp, t1 is unmodified, and z1 is |
| * in tmp + (n << 1). The buffer in z1 is free. |
| * |
| * In the end, z1 is written over t1, and tb0 is in t0. |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(z1, t1, tmp + (n << 1), logn); |
| memcpy(t1, tmp + (n << 1), n * sizeof * tmp); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, t0, tmp, z1, logn); |
| |
| /* |
| * Second recursive invocation, on the split tb0 (currently in t0) |
| * and the left sub-tree. |
| */ |
| z0 = tmp; |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z0, z0 + hn, t0, logn); |
| ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn, |
| g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(t0, z0, z0 + hn, logn); |
| } |
| |
| /* |
| * Perform Fast Fourier Sampling for target vector t and LDL tree T. |
 * tmp[] must have room for at least two polynomials of size 2^logn.
| */ |
| static void |
| ffSampling_fft(samplerZ samp, void *samp_ctx, |
| fpr *restrict z0, fpr *restrict z1, |
| const fpr *restrict tree, |
| const fpr *restrict t0, const fpr *restrict t1, unsigned logn, |
| fpr *restrict tmp) { |
| size_t n, hn; |
| const fpr *tree0, *tree1; |
| |
| /* |
| * When logn == 2, we inline the last two recursion levels. |
| */ |
| if (logn == 2) { |
| fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; |
| fpr a_re, a_im, b_re, b_im, c_re, c_im; |
| |
| tree0 = tree + 4; |
| tree1 = tree + 8; |
| |
| /* |
| * We split t1 into w*, then do the recursive invocation, |
| * with output in w*. We finally merge back into z1. |
| */ |
| // Split |
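        // (This inlines poly_split_fft for N = 4: the sum of the two
        // FFT coefficients is halved, and their difference is multiplied
        // by conj(e^(i*pi/4))/2 = (1 - i)/sqrt(8), hence fpr_invsqrt8.)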
| a_re = t1[0]; |
| a_im = t1[2]; |
| b_re = t1[1]; |
| b_im = t1[3]; |
| c_re = fpr_add(a_re, b_re); |
| c_im = fpr_add(a_im, b_im); |
| w0 = fpr_half(c_re); |
| w1 = fpr_half(c_im); |
| c_re = fpr_sub(a_re, b_re); |
| c_im = fpr_sub(a_im, b_im); |
| w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); |
| w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); |
| |
| // Sampling |
| x0 = w2; |
| x1 = w3; |
| sigma = tree1[3]; |
| w2 = fpr_of(samp(samp_ctx, x0, sigma)); |
| w3 = fpr_of(samp(samp_ctx, x1, sigma)); |
| a_re = fpr_sub(x0, w2); |
| a_im = fpr_sub(x1, w3); |
| b_re = tree1[0]; |
| b_im = tree1[1]; |
| c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); |
| c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); |
| x0 = fpr_add(c_re, w0); |
| x1 = fpr_add(c_im, w1); |
| sigma = tree1[2]; |
| w0 = fpr_of(samp(samp_ctx, x0, sigma)); |
| w1 = fpr_of(samp(samp_ctx, x1, sigma)); |
| |
| // Merge |
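        // (This inlines poly_merge_fft for N = 4: the odd half is
        // multiplied by e^(i*pi/4) = (1 + i)/sqrt(2), hence fpr_invsqrt2.)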
| a_re = w0; |
| a_im = w1; |
| b_re = w2; |
| b_im = w3; |
| c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); |
| c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); |
| z1[0] = w0 = fpr_add(a_re, c_re); |
| z1[2] = w2 = fpr_add(a_im, c_im); |
| z1[1] = w1 = fpr_sub(a_re, c_re); |
| z1[3] = w3 = fpr_sub(a_im, c_im); |
| |
| /* |
| * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. |
| */ |
| w0 = fpr_sub(t1[0], w0); |
| w1 = fpr_sub(t1[1], w1); |
| w2 = fpr_sub(t1[2], w2); |
| w3 = fpr_sub(t1[3], w3); |
| |
| a_re = w0; |
| a_im = w2; |
| b_re = tree[0]; |
| b_im = tree[2]; |
| w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); |
| w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); |
| a_re = w1; |
| a_im = w3; |
| b_re = tree[1]; |
| b_im = tree[3]; |
| w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); |
| w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); |
| |
| w0 = fpr_add(w0, t0[0]); |
| w1 = fpr_add(w1, t0[1]); |
| w2 = fpr_add(w2, t0[2]); |
| w3 = fpr_add(w3, t0[3]); |
| |
| /* |
| * Second recursive invocation. |
| */ |
| // Split |
| a_re = w0; |
| a_im = w2; |
| b_re = w1; |
| b_im = w3; |
| c_re = fpr_add(a_re, b_re); |
| c_im = fpr_add(a_im, b_im); |
| w0 = fpr_half(c_re); |
| w1 = fpr_half(c_im); |
| c_re = fpr_sub(a_re, b_re); |
| c_im = fpr_sub(a_im, b_im); |
| w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); |
| w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); |
| |
| // Sampling |
| x0 = w2; |
| x1 = w3; |
| sigma = tree0[3]; |
| w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); |
| w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); |
| a_re = fpr_sub(x0, y0); |
| a_im = fpr_sub(x1, y1); |
| b_re = tree0[0]; |
| b_im = tree0[1]; |
| c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); |
| c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); |
| x0 = fpr_add(c_re, w0); |
| x1 = fpr_add(c_im, w1); |
| sigma = tree0[2]; |
| w0 = fpr_of(samp(samp_ctx, x0, sigma)); |
| w1 = fpr_of(samp(samp_ctx, x1, sigma)); |
| |
| // Merge |
| a_re = w0; |
| a_im = w1; |
| b_re = w2; |
| b_im = w3; |
| c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); |
| c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); |
| z0[0] = fpr_add(a_re, c_re); |
| z0[2] = fpr_add(a_im, c_im); |
| z0[1] = fpr_sub(a_re, c_re); |
| z0[3] = fpr_sub(a_im, c_im); |
| |
| return; |
| } |
| |
| /* |
| * Case logn == 1 is reachable only when using Falcon-2 (the |
| * smallest size for which Falcon is mathematically defined, but |
| * of course way too insecure to be of any use). |
| */ |
| if (logn == 1) { |
| fpr x0, x1, y0, y1, sigma; |
| fpr a_re, a_im, b_re, b_im, c_re, c_im; |
| |
| x0 = t1[0]; |
| x1 = t1[1]; |
| sigma = tree[3]; |
| z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); |
| z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); |
| a_re = fpr_sub(x0, y0); |
| a_im = fpr_sub(x1, y1); |
| b_re = tree[0]; |
| b_im = tree[1]; |
| c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); |
| c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); |
| x0 = fpr_add(c_re, t0[0]); |
| x1 = fpr_add(c_im, t0[1]); |
| sigma = tree[2]; |
| z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); |
| z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); |
| |
| return; |
| } |
| |
| /* |
| * General recursive case (logn >= 2). |
| */ |
| |
| n = (size_t)1 << logn; |
| hn = n >> 1; |
| tree0 = tree + n; |
| tree1 = tree + n + ffLDL_treesize(logn - 1); |
| |
| /* |
| * We split t1 into z1 (reused as temporary storage), then do |
| * the recursive invocation, with output in tmp. We finally |
| * merge back into z1. |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn); |
| ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, |
| tree1, z1, z1 + hn, logn - 1, tmp + n); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(z1, tmp, tmp + hn, logn); |
| |
| /* |
| * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[]. |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(tmp, t1, z1, logn); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(tmp, t0, tmp, tree, logn); |
| |
| /* |
| * Second recursive invocation. |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z0, z0 + hn, tmp, logn); |
| ffSampling_fft(samp, samp_ctx, tmp, tmp + hn, |
| tree0, z0, z0 + hn, logn - 1, tmp + n); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(z0, tmp, tmp + hn, logn); |
| } |
| |
| /* |
| * Compute a signature: the signature contains two vectors, s1 and s2. |
| * The s1 vector is not returned. The squared norm of (s1,s2) is |
| * computed, and if it is short enough, then s2 is returned into the |
| * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is |
| * returned; the caller should then try again. This function uses an |
| * expanded key. |
| * |
| * tmp[] must have room for at least six polynomials. |
| */ |
| static int |
| do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, |
| const fpr *restrict expanded_key, |
| const uint16_t *hm, fpr *restrict tmp) { |
| fpr *t0, *t1, *tx, *ty; |
| const fpr *b00, *b01, *b10, *b11, *tree; |
| fpr ni; |
| int16_t *s1tmp, *s2tmp; |
| |
| t0 = tmp; |
| t1 = t0 + FALCON_N; |
| b00 = expanded_key + skoff_b00(FALCON_LOGN); |
| b01 = expanded_key + skoff_b01(FALCON_LOGN); |
| b10 = expanded_key + skoff_b10(FALCON_LOGN); |
| b11 = expanded_key + skoff_b11(FALCON_LOGN); |
| tree = expanded_key + skoff_tree(FALCON_LOGN); |
| |
| /* |
| * Set the target vector to [hm, 0] (hm is the hashed message). |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N); |
| |
| /* |
| * Apply the lattice basis to obtain the real target |
     * vector (after normalization with regard to the modulus).
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(t0, FALCON_LOGN); |
| ni = fpr_inverse_of_q; |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN); |
| |
| tx = t1 + FALCON_N; |
| ty = tx + FALCON_N; |
| |
| /* |
| * Apply sampling. Output is written back in [tx, ty]. |
| */ |
| ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, FALCON_LOGN, ty + FALCON_N); |
| |
| /* |
| * Get the lattice point corresponding to that tiny vector. |
| */ |
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, tx, b00, FALCON_LOGN);         // t0 <- tx*b00
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, t0, ty, b10, FALCON_LOGN); // t0 <- t0 + ty*b10
    PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t0, FALCON_LOGN);

    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, tx, b01, FALCON_LOGN);         // t1 <- tx*b01
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t1, t1, ty, b11, FALCON_LOGN); // t1 <- t1 + ty*b11
    PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t1, FALCON_LOGN);
| |
| /* |
| * Compute the signature. |
| */ |
| |
| /* |
| * With "normal" degrees (e.g. 512 or 1024), it is very |
| * improbable that the computed vector is not short enough; |
| * however, it may happen in practice for the very reduced |
| * versions (e.g. degree 16 or below). In that case, the caller |
| * will loop, and we must not write anything into s2[] because |
| * s2[] may overlap with the hashed message hm[] and we need |
| * hm[] for the next iteration. |
| */ |
| |
| s1tmp = (int16_t *)tx; |
| s2tmp = (int16_t *)tmp; |
| |
| if (PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) { |
| memcpy(s2, s2tmp, FALCON_N * sizeof * s2); |
| memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp); |
| return 1; |
| } |
| return 0; |
| } |
| |
| /* |
| * Compute a signature: the signature contains two vectors, s1 and s2. |
| * The s1 vector is not returned. The squared norm of (s1,s2) is |
| * computed, and if it is short enough, then s2 is returned into the |
| * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is |
| * returned; the caller should then try again. |
| * |
| * tmp[] must have room for at least nine polynomials. |
| */ |
| static int |
| do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, |
| const int8_t *restrict f, const int8_t *restrict g, |
| const int8_t *restrict F, const int8_t *restrict G, |
| const uint16_t *hm, fpr *restrict tmp) { |
| fpr *t0, *t1, *tx, *ty; |
| fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; |
| fpr ni; |
| int16_t *s1tmp, *s2tmp; |
| |
| /* |
| * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT. |
| */ |
| b00 = tmp; |
| b01 = b00 + FALCON_N; |
| b10 = b01 + FALCON_N; |
| b11 = b10 + FALCON_N; |
| t0 = b11 + FALCON_N; |
| t1 = t0 + FALCON_N; |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b00, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b01, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b01, b01, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b10, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b11, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b11, b11, FALCON_LOGN); |
| |
| /* |
| * Compute the Gram matrix G = B·B*. Formulas are: |
| * g00 = b00*adj(b00) + b01*adj(b01) |
| * g01 = b00*adj(b10) + b01*adj(b11) |
| * g10 = b10*adj(b00) + b11*adj(b01) |
| * g11 = b10*adj(b10) + b11*adj(b11) |
| * |
| * For historical reasons, this implementation uses |
| * g00, g01 and g11 (upper triangle). g10 is not kept |
| * since it is equal to adj(g01). |
| * |
| * We _replace_ the matrix B with the Gram matrix, but we |
| * must keep b01 and b11 for computing the target vector. |
| * |
| * Memory layout: |
| * b00 | b01 | b10 | b11 | t0 | t1 |
| * g00 | g01 | g11 | b01 | t0 | t1 |
| */ |
| |
    PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(t1, b00, b10, FALCON_LOGN); // t1 <- b00*adj(b10)

    PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(t0, b01, FALCON_LOGN);  // t0 <- b01*adj(b01)
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(b00, b00, FALCON_LOGN); // b00 <- b00*adj(b00)
    PQCLEAN_FALCONPADDED512_AARCH64_poly_add(b00, b00, t0, FALCON_LOGN);        // b00 <- g00

    memcpy(t0, b01, FALCON_N * sizeof * b01); // save b01 (still needed for the target vector)
    PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(b01, t1, b01, b11, FALCON_LOGN); // b01 <- g01 = b00*adj(b10) + b01*adj(b11)

    PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(b10, b10, FALCON_LOGN);          // b10 <- b10*adj(b10)
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(b10, b10, b11, FALCON_LOGN); // b10 <- g11 = b10*adj(b10) + b11*adj(b11)
| |
| /* |
| * We rename variables to make things clearer. The three elements |
     * of the Gram matrix use the first 3*n slots of tmp[], followed
| * by b11 and b01 (in that order). |
| */ |
| g00 = b00; |
| g01 = b01; |
| g11 = b10; |
| b01 = t0; |
| t0 = b01 + FALCON_N; |
| t1 = t0 + FALCON_N; |
| |
| /* |
| * Memory layout at that point: |
| * g00 g01 g11 b11 b01 t0 t1 |
| */ |
| |
| /* |
| * Set the target vector to [hm, 0] (hm is the hashed message). |
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N); |
| |
| /* |
| * Apply the lattice basis to obtain the real target |
     * vector (after normalization with regard to the modulus).
| */ |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(t0, FALCON_LOGN); |
| ni = fpr_inverse_of_q; |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN); |
| |
| /* |
| * b01 and b11 can be discarded, so we move back (t0,t1). |
| * Memory layout is now: |
| * g00 g01 g11 t0 t1 |
| */ |
| memcpy(b11, t0, FALCON_N * 2 * sizeof * t0); |
| t0 = g11 + FALCON_N; |
| t1 = t0 + FALCON_N; |
| |
| /* |
| * Apply sampling; result is written over (t0,t1). |
| * t1, g00 |
| */ |
| ffSampling_fft_dyntree(samp, samp_ctx, |
| t0, t1, g00, g01, g11, FALCON_LOGN, FALCON_LOGN, t1 + FALCON_N); |
| |
| /* |
| * We arrange the layout back to: |
| * b00 b01 b10 b11 t0 t1 |
| * |
     * We did not keep a copy of the basis matrix, so we must
     * recompute it now.
| */ |
| b00 = tmp; |
| b01 = b00 + FALCON_N; |
| b10 = b01 + FALCON_N; |
| b11 = b10 + FALCON_N; |
| memmove(b11 + FALCON_N, t0, FALCON_N * 2 * sizeof * t0); |
| t0 = b11 + FALCON_N; |
| t1 = t0 + FALCON_N; |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b00, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b01, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b01, b01, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b10, FALCON_LOGN); |
| |
| PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_FFT(b11, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b11, b11, FALCON_LOGN); |
| |
| tx = t1 + FALCON_N; |
| ty = tx + FALCON_N; |
| |
| /* |
| * Get the lattice point corresponding to that tiny vector. |
| */ |
| |
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(tx, t0, b00, FALCON_LOGN);         // tx <- t0*b00
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(ty, t0, b01, FALCON_LOGN);         // ty <- t0*b01
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, tx, t1, b10, FALCON_LOGN); // t0 <- tx + t1*b10
    PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t1, ty, t1, b11, FALCON_LOGN); // t1 <- ty + t1*b11
| |
| PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t0, FALCON_LOGN); |
| PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t1, FALCON_LOGN); |
| |
| /* |
| * With "normal" degrees (e.g. 512 or 1024), it is very |
| * improbable that the computed vector is not short enough; |
| * however, it may happen in practice for the very reduced |
| * versions (e.g. degree 16 or below). In that case, the caller |
| * will loop, and we must not write anything into s2[] because |
| * s2[] may overlap with the hashed message hm[] and we need |
| * hm[] for the next iteration. |
| */ |
| s1tmp = (int16_t *)tx; |
| s2tmp = (int16_t *)tmp; |
| |
| if (PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) { |
| memcpy(s2, s2tmp, FALCON_N * sizeof * s2); |
| memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp); |
| return 1; |
| } |
| return 0; |
| } |
| |
| /* see inner.h */ |
| void |
| PQCLEAN_FALCONPADDED512_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng, |
| const fpr *restrict expanded_key, |
| const uint16_t *hm, uint8_t *tmp) { |
| fpr *ftmp; |
| |
| ftmp = (fpr *)tmp; |
| for (;;) { |
| /* |
| * Signature produces short vectors s1 and s2. The |
| * signature is acceptable only if the aggregate vector |
| * s1,s2 is short; we must use the same bound as the |
| * verifier. |
| * |
| * If the signature is acceptable, then we return only s2 |
| * (the verifier recomputes s1 from s2, the hashed message, |
| * and the public key). |
| */ |
| sampler_context spc; |
| samplerZ samp; |
| void *samp_ctx; |
| |
| /* |
| * Normal sampling. We use a fast PRNG seeded from our |
| * SHAKE context ('rng'). |
| */ |
| spc.sigma_min = fpr_sigma_min_9; |
| PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&spc.p, rng); |
| samp = PQCLEAN_FALCONPADDED512_AARCH64_sampler; |
| samp_ctx = &spc; |
| |
| /* |
| * Do the actual signature. |
| */ |
| if (do_sign_tree(samp, samp_ctx, sig, expanded_key, hm, ftmp)) { |
| break; |
| } |
| } |
| } |
| |
| /* see inner.h */ |
| void |
| PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng, |
| const int8_t *restrict f, const int8_t *restrict g, |
| const int8_t *restrict F, const int8_t *restrict G, |
| const uint16_t *hm, uint8_t *tmp) { |
| fpr *ftmp; |
| |
| ftmp = (fpr *)tmp; |
    for (;;) {
        /*
| * Signature produces short vectors s1 and s2. The |
| * signature is acceptable only if the aggregate vector |
| * s1,s2 is short; we must use the same bound as the |
| * verifier. |
| * |
| * If the signature is acceptable, then we return only s2 |
| * (the verifier recomputes s1 from s2, the hashed message, |
| * and the public key). |
| */ |
| sampler_context spc; |
| samplerZ samp; |
| void *samp_ctx; |
| |
| /* |
| * Normal sampling. We use a fast PRNG seeded from our |
| * SHAKE context ('rng'). |
| */ |
| |
| spc.sigma_min = fpr_sigma_min_9; |
| PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&spc.p, rng); |
| samp = PQCLEAN_FALCONPADDED512_AARCH64_sampler; |
| samp_ctx = &spc; |
| |
| /* |
| * Do the actual signature. |
| */ |
| if (do_sign_dyn(samp, samp_ctx, sig, f, g, F, G, hm, ftmp)) { |
| break; |
| } |
| } |
| } |