// SPDX-License-Identifier: Public domain

#include <oqs/oqs.h>

#include "sha2_local.h"

#include <stdint.h>
// ARM includes
#ifndef WIN32
#include <arm_acle.h>
#endif
#include <arm_neon.h>
/* Based on the public domain implementation in
 * crypto_hashblocks/sha256/dolbeau/armv8crypto
 * from http://bench.cr.yp.to/supercop.html
 * by D. J. Bernstein */
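
/*
 * Uses the ARMv8 SHA-2 crypto extensions: vsha256hq_u32 and vsha256h2q_u32
 * each perform four rounds of the compression function, while
 * vsha256su0q_u32 and vsha256su1q_u32 compute the message-schedule
 * expansion. This file must be built for a target with these instructions
 * enabled (e.g. -march=armv8-a+crypto on GCC/Clang).
 */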
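/*
 * The incremental state keeps a 64-bit big-endian byte counter at
 * state->ctx + 32, just past the 32-byte hash state; these two helpers
 * read and write that counter.
 */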
static uint64_t load_bigendian_64(const uint8_t *x) {
    return (uint64_t)(x[7]) | (((uint64_t)(x[6])) << 8) |
           (((uint64_t)(x[5])) << 16) | (((uint64_t)(x[4])) << 24) |
           (((uint64_t)(x[3])) << 32) | (((uint64_t)(x[2])) << 40) |
           (((uint64_t)(x[1])) << 48) | (((uint64_t)(x[0])) << 56);
}

static void store_bigendian_64(uint8_t *x, uint64_t u) {
    x[7] = (uint8_t) u;
    u >>= 8;
    x[6] = (uint8_t) u;
    u >>= 8;
    x[5] = (uint8_t) u;
    u >>= 8;
    x[4] = (uint8_t) u;
    u >>= 8;
    x[3] = (uint8_t) u;
    u >>= 8;
    x[2] = (uint8_t) u;
    u >>= 8;
    x[1] = (uint8_t) u;
    u >>= 8;
    x[0] = (uint8_t) u;
}

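/*
 * Compress as many complete 64-byte blocks of `data` as possible into the
 * 32-byte state at `statebytes`. Returns the number of unprocessed
 * trailing bytes (always < 64).
 */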
static size_t crypto_hashblocks_sha256_armv8(uint8_t *statebytes,
        const uint8_t *data, size_t length) {
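    /* SHA-256 round constants K[0..63] (FIPS 180-4, section 4.2.2) */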
    static const uint32_t s256cst[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
    };
    size_t pos = 0;
    /* load constants */
    uint32x4_t c0 = vld1q_u32(s256cst + 0);
    uint32x4_t c1 = vld1q_u32(s256cst + 4);
    uint32x4_t c2 = vld1q_u32(s256cst + 8);
    uint32x4_t c3 = vld1q_u32(s256cst + 12);
    uint32x4_t c4 = vld1q_u32(s256cst + 16);
    uint32x4_t c5 = vld1q_u32(s256cst + 20);
    uint32x4_t c6 = vld1q_u32(s256cst + 24);
    uint32x4_t c7 = vld1q_u32(s256cst + 28);
    uint32x4_t c8 = vld1q_u32(s256cst + 32);
    uint32x4_t c9 = vld1q_u32(s256cst + 36);
    uint32x4_t ca = vld1q_u32(s256cst + 40);
    uint32x4_t cb = vld1q_u32(s256cst + 44);
    uint32x4_t cc = vld1q_u32(s256cst + 48);
    uint32x4_t cd = vld1q_u32(s256cst + 52);
    uint32x4_t ce = vld1q_u32(s256cst + 56);
    uint32x4_t cf = vld1q_u32(s256cst + 60);
    /* load state */
    uint32x4_t d0 = vld1q_u32((uint32_t *)(statebytes + 0));
    uint32x4_t d1 = vld1q_u32((uint32_t *)(statebytes + 16));
    uint32x4_t s0, s1, h0, h1;
    /* convert the big-endian state bytes to native 32-bit words */
    d0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(d0)));
    d1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(d1)));
    while (length >= 64) {
        /* load one block */
        uint32x4_t i0 = vld1q_u32((const uint32_t *)(data + pos + 0));
        uint32x4_t i1 = vld1q_u32((const uint32_t *)(data + pos + 16));
        uint32x4_t i2 = vld1q_u32((const uint32_t *)(data + pos + 32));
        uint32x4_t i3 = vld1q_u32((const uint32_t *)(data + pos + 48));
        uint32x4_t j0, j1, j2, j3;
        uint32x4_t x0, x1;

        /* copy state to working space */
        s0 = d0;
        s1 = d1;

        /* convert the block from big-endian bytes to native words */
        i0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(i0)));
        i1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(i1)));
        i2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(i2)));
        i3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(i3)));

        /*
         * Compute 16 rounds on the schedule words in i0..i3 with the
         * 16 constants in c0..c3; each vsha256hq/vsha256h2q pair
         * performs four rounds. h0, h1, x0, x1 are scratch.
         */
#define DO16ROUNDS(i0, i1, i2, i3, c0, c1, c2, c3) \
    h0 = vaddq_u32(i0, c0); \
    x0 = vsha256hq_u32(s0, s1, h0); \
    x1 = vsha256h2q_u32(s1, s0, h0); \
    h1 = vaddq_u32(i1, c1); \
    s0 = vsha256hq_u32(x0, x1, h1); \
    s1 = vsha256h2q_u32(x1, x0, h1); \
    h0 = vaddq_u32(i2, c2); \
    x0 = vsha256hq_u32(s0, s1, h0); \
    x1 = vsha256h2q_u32(s1, s0, h0); \
    h1 = vaddq_u32(i3, c3); \
    s0 = vsha256hq_u32(x0, x1, h1); \
    s1 = vsha256h2q_u32(x1, x0, h1)

        /*
         * Expand the message schedule: derive the next 16 schedule
         * words j0..j3 from the current 16 in i0..i3.
         */
#define DO16EXPANDS(i0, i1, i2, i3, j0, j1, j2, j3) \
    j0 = vsha256su0q_u32(i0, i1); \
    j0 = vsha256su1q_u32(j0, i2, i3); \
    j1 = vsha256su0q_u32(i1, i2); \
    j1 = vsha256su1q_u32(j1, i3, j0); \
    j2 = vsha256su0q_u32(i2, i3); \
    j2 = vsha256su1q_u32(j2, j0, j1); \
    j3 = vsha256su0q_u32(i3, j0); \
    j3 = vsha256su1q_u32(j3, j1, j2)

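        /*
         * 64 rounds in total: four 16-round groups, with each expansion
         * producing the schedule words for the following group.
         */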
        DO16ROUNDS(i0, i1, i2, i3, c0, c1, c2, c3);

        DO16EXPANDS(i0, i1, i2, i3, j0, j1, j2, j3);

        DO16ROUNDS(j0, j1, j2, j3, c4, c5, c6, c7);

        DO16EXPANDS(j0, j1, j2, j3, i0, i1, i2, i3);

        DO16ROUNDS(i0, i1, i2, i3, c8, c9, ca, cb);

        DO16EXPANDS(i0, i1, i2, i3, j0, j1, j2, j3);

        DO16ROUNDS(j0, j1, j2, j3, cc, cd, ce, cf);

        /* add the working variables back into the chaining state */
        d0 = vaddq_u32(s0, d0);
        d1 = vaddq_u32(s1, d1);

        length -= 64;
        pos += 64;
    }

    /* convert back to big-endian bytes and store the state */
    d0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(d0)));
    d1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(d1)));
    vst1q_u32((uint32_t *)(statebytes + 0), d0);
    vst1q_u32((uint32_t *)(statebytes + 16), d1);

    return length;
}

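/*
 * Absorb the final `inlen` bytes of input, apply the SHA-256 padding
 * (a 0x80 byte, zeros, and the total message length in bits as a 64-bit
 * big-endian value), write the 32-byte digest to `out`, and release the
 * context.
 */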
void oqs_sha2_sha256_inc_finalize_armv8(uint8_t *out, sha256ctx *state, const uint8_t *in, size_t inlen) {
    uint8_t padded[128];
    uint64_t bytes = load_bigendian_64(state->ctx + 32) + inlen;

    crypto_hashblocks_sha256_armv8(state->ctx, in, inlen);
    in += inlen;
    inlen &= 63;
    in -= inlen;
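    /* in now points at the start of the trailing partial block and
     * inlen is its length (0..63) */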

    for (size_t i = 0; i < inlen; ++i) {
        padded[i] = in[i];
    }
    padded[inlen] = 0x80;

    if (inlen < 56) {
        for (size_t i = inlen + 1; i < 56; ++i) {
            padded[i] = 0;
        }
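        /* append the message length in bits (bytes * 8), big-endian;
         * the * 8 is folded into the shift amounts: 56 - 3 = 53, ..., << 3 */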
        padded[56] = (uint8_t) (bytes >> 53);
        padded[57] = (uint8_t) (bytes >> 45);
        padded[58] = (uint8_t) (bytes >> 37);
        padded[59] = (uint8_t) (bytes >> 29);
        padded[60] = (uint8_t) (bytes >> 21);
        padded[61] = (uint8_t) (bytes >> 13);
        padded[62] = (uint8_t) (bytes >> 5);
        padded[63] = (uint8_t) (bytes << 3);
        crypto_hashblocks_sha256_armv8(state->ctx, padded, 64);
    } else {
        for (size_t i = inlen + 1; i < 120; ++i) {
            padded[i] = 0;
        }
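        /* same bit-length encoding, at the end of a second padding block */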
        padded[120] = (uint8_t) (bytes >> 53);
        padded[121] = (uint8_t) (bytes >> 45);
        padded[122] = (uint8_t) (bytes >> 37);
        padded[123] = (uint8_t) (bytes >> 29);
        padded[124] = (uint8_t) (bytes >> 21);
        padded[125] = (uint8_t) (bytes >> 13);
        padded[126] = (uint8_t) (bytes >> 5);
        padded[127] = (uint8_t) (bytes << 3);
        crypto_hashblocks_sha256_armv8(state->ctx, padded, 128);
    }

    for (size_t i = 0; i < 32; ++i) {
        out[i] = state->ctx[i];
    }
    oqs_sha2_sha256_inc_ctx_release_c(state);
}

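/*
 * SHA-224 uses the same compression function as SHA-256 and differs only
 * in its initial state (set by the init function) and in truncating the
 * output to 28 bytes.
 */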
void oqs_sha2_sha224_inc_finalize_armv8(uint8_t *out, sha224ctx *state, const uint8_t *in, size_t inlen) {
    uint8_t tmp[32];
    oqs_sha2_sha256_inc_finalize_armv8(tmp, (sha256ctx *)state, in, inlen);

    for (size_t i = 0; i < 28; ++i) {
        out[i] = tmp[i];
    }
}

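/* Absorb `inblocks` complete 64-byte blocks and update the byte counter. */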
void oqs_sha2_sha256_inc_blocks_armv8(sha256ctx *state, const uint8_t *in, size_t inblocks) {
    uint64_t bytes = load_bigendian_64(state->ctx + 32);

    crypto_hashblocks_sha256_armv8(state->ctx, in, 64 * inblocks);
    bytes += 64 * inblocks;

    store_bigendian_64(state->ctx + 32, bytes);
}

void oqs_sha2_sha224_inc_blocks_armv8(sha224ctx *state, const uint8_t *in, size_t inblocks) {
    oqs_sha2_sha256_inc_blocks_armv8((sha256ctx *) state, in, inblocks);
}

void oqs_sha2_sha256_armv8(uint8_t *out, const uint8_t *in, size_t inlen) {
    sha256ctx state;

    oqs_sha2_sha256_inc_init_c(&state);
    oqs_sha2_sha256_inc_finalize_armv8(out, &state, in, inlen);
}

void oqs_sha2_sha224_armv8(uint8_t *out, const uint8_t *in, size_t inlen) {
    sha224ctx state;

    oqs_sha2_sha224_inc_init_c(&state);
    oqs_sha2_sha224_inc_finalize_armv8(out, &state, in, inlen);
}
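
/*
 * Usage sketch: oqs_sha2_sha256_armv8(digest, msg, msglen) writes the
 * 32-byte SHA-256 of msg in one shot. For incremental hashing, call
 * oqs_sha2_sha256_inc_init_c, feed whole 64-byte blocks with
 * oqs_sha2_sha256_inc_blocks_armv8, and pass the trailing partial block
 * to oqs_sha2_sha256_inc_finalize_armv8, which also releases the context.
 */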