src/sig/falcon/pqclean_falcon-padded-512_avx2/rng.c - platform/external/liboqs - Git at Google

 /*
  * PRNG and interface to the system RNG.
  *
  * ==========================(LICENSE BEGIN)============================
  *
  * Copyright (c) 2017-2019  Falcon Project
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sublicense, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be
  * included in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * ===========================(LICENSE END)=============================
  *
  * @author   Thomas Pornin <[email protected]>
  */

 #include <assert.h>

 #include "inner.h"

 /* see inner.h */
 void
 PQCLEAN_FALCONPADDED512_AVX2_prng_init(prng *p, inner_shake256_context *src) {
     inner_shake256_extract(src, p->state.d, 56);
     PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
 }

 /*
  * PRNG based on ChaCha20.
  *
  * State consists in key (32 bytes) then IV (16 bytes) and block counter
  * (8 bytes). Normally, we should not care about local endianness (this
  * is for a PRNG), but for the NIST competition we need reproducible KAT
  * vectors that work across architectures, so we enforce little-endian
  * interpretation where applicable. Moreover, output words are "spread
  * out" over the output buffer with the interleaving pattern that is
  * naturally obtained from the AVX2 implementation that runs eight
  * ChaCha20 instances in parallel.
  *
  * The block counter is XORed into the first 8 bytes of the IV.
  */
 void
 PQCLEAN_FALCONPADDED512_AVX2_prng_refill(prng *p) {

     static const uint32_t CW[] = {
         0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
     };

     uint64_t cc;
     size_t u;
     int i;
     uint32_t *sw;
     union {
         uint32_t w[16];
         __m256i y[2];  /* for alignment */
     } t;
     __m256i state[16], init[16];

     sw = (uint32_t *)p->state.d;

     /*
      * XOR next counter values into state.
      */
     cc = *(uint64_t *)(p->state.d + 48);
     for (u = 0; u < 8; u ++) {
         t.w[u] = (uint32_t)(cc + u);
         t.w[u + 8] = (uint32_t)((cc + u) >> 32);
     }
     *(uint64_t *)(p->state.d + 48) = cc + 8;

     /*
      * Load state.
      */
     for (u = 0; u < 4; u ++) {
         state[u] = init[u] =
                        _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)CW[u]));
     }
     for (u = 0; u < 10; u ++) {
         state[u + 4] = init[u + 4] =
                            _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[u]));
     }
     state[14] = init[14] = _mm256_xor_si256(
                                _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[10])),
                                _mm256_loadu_si256((__m256i *)&t.w[0]));
     state[15] = init[15] = _mm256_xor_si256(
                                _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[11])),
                                _mm256_loadu_si256((__m256i *)&t.w[8]));

     /*
      * Do all rounds.
      */
     for (i = 0; i < 10; i ++) {

 #define QROUND(a, b, c, d)   do { \
         state[a] = _mm256_add_epi32(state[a], state[b]); \
         state[d] = _mm256_xor_si256(state[d], state[a]); \
         state[d] = _mm256_or_si256( \
                                     _mm256_slli_epi32(state[d], 16), \
                                     _mm256_srli_epi32(state[d], 16)); \
         state[c] = _mm256_add_epi32(state[c], state[d]); \
         state[b] = _mm256_xor_si256(state[b], state[c]); \
         state[b] = _mm256_or_si256( \
                                     _mm256_slli_epi32(state[b], 12), \
                                     _mm256_srli_epi32(state[b], 20)); \
         state[a] = _mm256_add_epi32(state[a], state[b]); \
         state[d] = _mm256_xor_si256(state[d], state[a]); \
         state[d] = _mm256_or_si256( \
                                     _mm256_slli_epi32(state[d],  8), \
                                     _mm256_srli_epi32(state[d], 24)); \
         state[c] = _mm256_add_epi32(state[c], state[d]); \
         state[b] = _mm256_xor_si256(state[b], state[c]); \
         state[b] = _mm256_or_si256( \
                                     _mm256_slli_epi32(state[b], 7), \
                                     _mm256_srli_epi32(state[b], 25)); \
     } while (0)

         QROUND( 0,  4,  8, 12);
         QROUND( 1,  5,  9, 13);
         QROUND( 2,  6, 10, 14);
         QROUND( 3,  7, 11, 15);
         QROUND( 0,  5, 10, 15);
         QROUND( 1,  6, 11, 12);
         QROUND( 2,  7,  8, 13);
         QROUND( 3,  4,  9, 14);

 #undef QROUND

     }

     /*
      * Add initial state back and encode the result in the destination
      * buffer. We can dump the AVX2 values "as is" because the non-AVX2
      * code uses a compatible order of values.
      */
     for (u = 0; u < 16; u ++) {
         _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
                             _mm256_add_epi32(state[u], init[u]));
     }

     p->ptr = 0;
 }

 /* see inner.h */
 void
 PQCLEAN_FALCONPADDED512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) {
     uint8_t *buf;

     buf = dst;
     while (len > 0) {
         size_t clen;

         clen = (sizeof p->buf.d) - p->ptr;
         if (clen > len) {
             clen = len;
         }
         memcpy(buf, p->buf.d, clen);
         buf += clen;
         len -= clen;
         p->ptr += clen;
         if (p->ptr == sizeof p->buf.d) {
             PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
         }
     }
 }
	/*
	* PRNG and interface to the system RNG.
	*
	* ==========================(LICENSE BEGIN)============================
	*
	* Copyright (c) 2017-2019 Falcon Project
	*
	* Permission is hereby granted, free of charge, to any person obtaining
	* a copy of this software and associated documentation files (the
	* "Software"), to deal in the Software without restriction, including
	* without limitation the rights to use, copy, modify, merge, publish,
	* distribute, sublicense, and/or sell copies of the Software, and to
	* permit persons to whom the Software is furnished to do so, subject to
	* the following conditions:
	*
	* The above copyright notice and this permission notice shall be
	* included in all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
	* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
	* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	*
	* ===========================(LICENSE END)=============================
	*
	* @author Thomas Pornin <[email protected]>
	*/

	#include <assert.h>

	#include "inner.h"

	/* see inner.h */
	void
	PQCLEAN_FALCONPADDED512_AVX2_prng_init(prng p, inner_shake256_context src) {
	inner_shake256_extract(src, p->state.d, 56);
	PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
	}

	/*
	* PRNG based on ChaCha20.
	*
	* State consists in key (32 bytes) then IV (16 bytes) and block counter
	* (8 bytes). Normally, we should not care about local endianness (this
	* is for a PRNG), but for the NIST competition we need reproducible KAT
	* vectors that work across architectures, so we enforce little-endian
	* interpretation where applicable. Moreover, output words are "spread
	* out" over the output buffer with the interleaving pattern that is
	* naturally obtained from the AVX2 implementation that runs eight
	* ChaCha20 instances in parallel.
	*
	* The block counter is XORed into the first 8 bytes of the IV.
	*/
	void
	PQCLEAN_FALCONPADDED512_AVX2_prng_refill(prng *p) {

	static const uint32_t CW[] = {
	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	uint64_t cc;
	size_t u;
	int i;
	uint32_t *sw;
	union {
	uint32_t w[16];
	__m256i y[2]; /* for alignment */
	} t;
	__m256i state[16], init[16];

	sw = (uint32_t *)p->state.d;

	/*
	* XOR next counter values into state.
	*/
	cc = (uint64_t )(p->state.d + 48);
	for (u = 0; u < 8; u ++) {
	t.w[u] = (uint32_t)(cc + u);
	t.w[u + 8] = (uint32_t)((cc + u) >> 32);
	}
	(uint64_t )(p->state.d + 48) = cc + 8;

	/*
	* Load state.
	*/
	for (u = 0; u < 4; u ++) {
	state[u] = init[u] =
	_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)CW[u]));
	}
	for (u = 0; u < 10; u ++) {
	state[u + 4] = init[u + 4] =
	_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[u]));
	}
	state[14] = init[14] = _mm256_xor_si256(
	_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[10])),
	_mm256_loadu_si256((__m256i *)&t.w[0]));
	state[15] = init[15] = _mm256_xor_si256(
	_mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[11])),
	_mm256_loadu_si256((__m256i *)&t.w[8]));

	/*
	* Do all rounds.
	*/
	for (i = 0; i < 10; i ++) {

	#define QROUND(a, b, c, d) do { \
	state[a] = _mm256_add_epi32(state[a], state[b]); \
	state[d] = _mm256_xor_si256(state[d], state[a]); \
	state[d] = _mm256_or_si256( \
	_mm256_slli_epi32(state[d], 16), \
	_mm256_srli_epi32(state[d], 16)); \
	state[c] = _mm256_add_epi32(state[c], state[d]); \
	state[b] = _mm256_xor_si256(state[b], state[c]); \
	state[b] = _mm256_or_si256( \
	_mm256_slli_epi32(state[b], 12), \
	_mm256_srli_epi32(state[b], 20)); \
	state[a] = _mm256_add_epi32(state[a], state[b]); \
	state[d] = _mm256_xor_si256(state[d], state[a]); \
	state[d] = _mm256_or_si256( \
	_mm256_slli_epi32(state[d], 8), \
	_mm256_srli_epi32(state[d], 24)); \
	state[c] = _mm256_add_epi32(state[c], state[d]); \
	state[b] = _mm256_xor_si256(state[b], state[c]); \
	state[b] = _mm256_or_si256( \
	_mm256_slli_epi32(state[b], 7), \
	_mm256_srli_epi32(state[b], 25)); \
	} while (0)

	QROUND( 0, 4, 8, 12);
	QROUND( 1, 5, 9, 13);
	QROUND( 2, 6, 10, 14);
	QROUND( 3, 7, 11, 15);
	QROUND( 0, 5, 10, 15);
	QROUND( 1, 6, 11, 12);
	QROUND( 2, 7, 8, 13);
	QROUND( 3, 4, 9, 14);

	#undef QROUND

	}

	/*
	* Add initial state back and encode the result in the destination
	* buffer. We can dump the AVX2 values "as is" because the non-AVX2
	* code uses a compatible order of values.
	*/
	for (u = 0; u < 16; u ++) {
	_mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
	_mm256_add_epi32(state[u], init[u]));
	}

	p->ptr = 0;
	}

	/* see inner.h */
	void
	PQCLEAN_FALCONPADDED512_AVX2_prng_get_bytes(prng p, void dst, size_t len) {
	uint8_t *buf;

	buf = dst;
	while (len > 0) {
	size_t clen;

	clen = (sizeof p->buf.d) - p->ptr;
	if (clen > len) {
	clen = len;
	}
	memcpy(buf, p->buf.d, clen);
	buf += clen;
	len -= clen;
	p->ptr += clen;
	if (p->ptr == sizeof p->buf.d) {
	PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
	}
	}
	}