| /*------------------------------------------------------------------------ |
| / OCB Version 3 Reference Code (Optimized C) Last modified 12-JUN-2013 |
| /------------------------------------------------------------------------- |
| / Copyright (c) 2013 Ted Krovetz. |
| / |
| / Permission to use, copy, modify, and/or distribute this software for any |
| / purpose with or without fee is hereby granted, provided that the above |
| / copyright notice and this permission notice appear in all copies. |
| / |
| / THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| / WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| / MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
| / ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| / WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| / ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
| / OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| / |
| / Phillip Rogaway holds patents relevant to OCB. See the following for |
| / his patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm |
| / |
| / Special thanks to Keegan McAllister for suggesting several good improvements |
| / |
| / Comments are welcome: Ted Krovetz <[email protected]> - Dedicated to Laurel K |
| /------------------------------------------------------------------------- */ |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Usage notes */ |
| /* ----------------------------------------------------------------------- */ |
| |
/* - When AE_PENDING is passed as the 'final' parameter of any function,
/    the length parameters must be multiples of (BPI*16).
/  - When available, SSE or AltiVec registers are used to manipulate data.
/    So, on machines with these facilities, all pointers passed to
/    any function should be 16-byte aligned.
/  - Plaintext and ciphertext pointers may be equal (i.e., plaintext gets
/    encrypted in-place), but no other pair of pointers may be equal.
/  - This code assumes all x86 processors have SSE2 and SSSE3 instructions
/    when compiling under MSVC. If untrue, alter the #define.
/  - This code is tested with C99 and recent versions of GCC and MSVC. */
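
/* A minimal usage sketch (error handling omitted; see ae.h for the
/  authoritative interface and the AE_* constants):
/
/      ae_ctx *ctx = ae_allocate(NULL);
/      ae_init(ctx, key, 16, 12, 16);         (128-bit key, 12-byte nonce)
/      len = ae_encrypt(ctx, nonce, pt, pt_len, ad, ad_len,
/                       ct, NULL, AE_FINALIZE);
/
/  With tag == NULL the tag is appended to ct, so len equals pt_len plus
/  the tag length. Decryption mirrors this and returns AE_INVALID on
/  authentication failure:
/
/      len = ae_decrypt(ctx, nonce, ct, len, ad, ad_len,
/                       pt, NULL, AE_FINALIZE);
/      ae_free(ctx);
*/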
| |
| /* ----------------------------------------------------------------------- */ |
| /* User configuration options */ |
| /* ----------------------------------------------------------------------- */ |
| |
/* Set the AES key length to use and the length of authentication tag to
/  produce. Setting either to 0 requires the value be set at runtime via
/  ae_init(). Fixing either at compile time enables some optimizations. */
| #define OCB_KEY_LEN 16 /* 0, 16, 24 or 32. 0 means set in ae_init */ |
| #define OCB_TAG_LEN 16 /* 0 to 16. 0 means set in ae_init */ |
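
/* For example, building with OCB_KEY_LEN == 0 makes ae_init() honor its
/  key_len argument, so ae_init(ctx, key, 32, 12, 16) selects AES-256;
/  with a fixed OCB_KEY_LEN the key_len argument is ignored. */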
| |
| /* This implementation has built-in support for multiple AES APIs. Set any |
| / one of the following to non-zero to specify which to use. */ |
| #define USE_OPENSSL_AES 1 /* http://openssl.org */ |
| #define USE_REFERENCE_AES 0 /* Internet search: rijndael-alg-fst.c */ |
| #define USE_AES_NI 0 /* Uses compiler's intrinsics */ |
| |
| /* During encryption and decryption, various "L values" are required. |
| / The L values can be precomputed during initialization (requiring extra |
| / space in ae_ctx), generated as needed (slightly slowing encryption and |
| / decryption), or some combination of the two. L_TABLE_SZ specifies how many |
| / L values to precompute. L_TABLE_SZ must be at least 3. L_TABLE_SZ*16 bytes |
/ are used for L values in ae_ctx. Plaintexts and ciphertexts shorter than
/ 2^L_TABLE_SZ blocks need no L values computed dynamically. */
| #define L_TABLE_SZ 16 |
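
/* For example, the default L_TABLE_SZ of 16 precomputes L_0..L_15 (256
/  bytes in ae_ctx), so texts up to 2^16 blocks (1 MiB) never require a
/  dynamically computed L value. */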
| |
| /* Set L_TABLE_SZ_IS_ENOUGH non-zero iff you know that all plaintexts |
| / will be shorter than 2^(L_TABLE_SZ+4) bytes in length. This results |
| / in better performance. */ |
| #define L_TABLE_SZ_IS_ENOUGH 1 |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Includes and compiler specific definitions */ |
| /* ----------------------------------------------------------------------- */ |
| |
| #include "ae.h" |
| #include <stdlib.h> |
| #include <string.h> |
| |
| /* Define standard sized integers */ |
| #if defined(_MSC_VER) && (_MSC_VER < 1600) |
| typedef unsigned __int8 uint8_t; |
| typedef unsigned __int32 uint32_t; |
| typedef unsigned __int64 uint64_t; |
| typedef __int64 int64_t; |
| #else |
| #include <stdint.h> |
| #endif |
| |
| /* Compiler-specific intrinsics and fixes: bswap64, ntz */ |
| #if _MSC_VER |
| #define inline __inline /* MSVC doesn't recognize "inline" in C */ |
| #define restrict __restrict /* MSVC doesn't recognize "restrict" in C */ |
| #define __SSE2__ (_M_IX86 || _M_AMD64 || _M_X64) /* Assume SSE2 */ |
| #define __SSSE3__ (_M_IX86 || _M_AMD64 || _M_X64) /* Assume SSSE3 */ |
| #include <intrin.h> |
| #pragma intrinsic(_byteswap_uint64, _BitScanForward, memcpy) |
| #define bswap64(x) _byteswap_uint64(x) |
static inline unsigned ntz(unsigned x) {
    unsigned long index; /* _BitScanForward takes an unsigned long* */
    _BitScanForward(&index, x);
    return (unsigned)index;
}
| #elif __GNUC__ |
| #define inline __inline__ /* No "inline" in GCC ansi C mode */ |
| #define restrict __restrict__ /* No "restrict" in GCC ansi C mode */ |
| #define bswap64(x) __builtin_bswap64(x) /* Assuming GCC 4.3+ */ |
| #define ntz(x) __builtin_ctz((unsigned)(x)) /* Assuming GCC 3.4+ */ |
| #else /* Assume some C99 features: stdint.h, inline, restrict */ |
| #define bswap32(x) \ |
| ((((x)&0xff000000u) >> 24) | (((x)&0x00ff0000u) >> 8) | (((x)&0x0000ff00u) << 8) | \ |
| (((x)&0x000000ffu) << 24)) |
| |
| static inline uint64_t bswap64(uint64_t x) { |
| union { |
| uint64_t u64; |
| uint32_t u32[2]; |
| } in, out; |
| in.u64 = x; |
| out.u32[0] = bswap32(in.u32[1]); |
| out.u32[1] = bswap32(in.u32[0]); |
| return out.u64; |
| } |
| |
| #if (L_TABLE_SZ <= 9) && (L_TABLE_SZ_IS_ENOUGH) /* < 2^13 byte texts */ |
| static inline unsigned ntz(unsigned x) { |
| static const unsigned char tz_table[] = { |
| 0, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 6, 2, 3, 2, 4, 2, 3, 2, 5, 2, |
| 3, 2, 4, 2, 3, 2, 7, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 6, 2, 3, 2, |
| 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 8, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, |
| 3, 2, 6, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2, 7, 2, 3, 2, 4, 2, 3, 2, |
| 5, 2, 3, 2, 4, 2, 3, 2, 6, 2, 3, 2, 4, 2, 3, 2, 5, 2, 3, 2, 4, 2, 3, 2}; |
| return tz_table[x / 4]; |
| } |
| #else /* From http://supertech.csail.mit.edu/papers/debruijn.pdf */ |
| static inline unsigned ntz(unsigned x) { |
| static const unsigned char tz_table[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, |
| 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, |
| 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; |
| return tz_table[((uint32_t)((x & -x) * 0x077CB531u)) >> 27]; |
| } |
| #endif |
| #endif |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Define blocks and operations -- Patch if incorrect on your compiler. */ |
| /* ----------------------------------------------------------------------- */ |
| |
| #if __SSE2__ && !KEYMASTER_CLANG_TEST_BUILD |
| #include <xmmintrin.h> /* SSE instructions and _mm_malloc */ |
| #include <emmintrin.h> /* SSE2 instructions */ |
| typedef __m128i block; |
| #define xor_block(x, y) _mm_xor_si128(x, y) |
| #define zero_block() _mm_setzero_si128() |
| #define unequal_blocks(x, y) (_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) != 0xffff) |
| #if __SSSE3__ || USE_AES_NI |
| #include <tmmintrin.h> /* SSSE3 instructions */ |
| #define swap_if_le(b) \ |
| _mm_shuffle_epi8(b, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)) |
| #else |
| static inline block swap_if_le(block b) { |
| block a = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 1, 2, 3)); |
| a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
| a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
| return _mm_xor_si128(_mm_srli_epi16(a, 8), _mm_slli_epi16(a, 8)); |
| } |
| #endif |
| static inline block gen_offset(uint64_t KtopStr[3], unsigned bot) { |
| block hi = _mm_load_si128((__m128i*)(KtopStr + 0)); /* hi = B A */ |
| block lo = _mm_loadu_si128((__m128i*)(KtopStr + 1)); /* lo = C B */ |
| __m128i lshift = _mm_cvtsi32_si128(bot); |
| __m128i rshift = _mm_cvtsi32_si128(64 - bot); |
| lo = _mm_xor_si128(_mm_sll_epi64(hi, lshift), _mm_srl_epi64(lo, rshift)); |
| #if __SSSE3__ || USE_AES_NI |
| return _mm_shuffle_epi8(lo, _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); |
| #else |
| return swap_if_le(_mm_shuffle_epi32(lo, _MM_SHUFFLE(1, 0, 3, 2))); |
| #endif |
| } |
| static inline block double_block(block bl) { |
| const __m128i mask = _mm_set_epi32(135, 1, 1, 1); |
| __m128i tmp = _mm_srai_epi32(bl, 31); |
| tmp = _mm_and_si128(tmp, mask); |
| tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3)); |
| bl = _mm_slli_epi32(bl, 1); |
| return _mm_xor_si128(bl, tmp); |
| } |
| #elif __ALTIVEC__ |
| #include <altivec.h> |
| typedef vector unsigned block; |
| #define xor_block(x, y) vec_xor(x, y) |
| #define zero_block() vec_splat_u32(0) |
| #define unequal_blocks(x, y) vec_any_ne(x, y) |
| #define swap_if_le(b) (b) |
| #if __PPC64__ |
block gen_offset(uint64_t KtopStr[3], unsigned bot) {
    union {
        uint64_t u64[2];
        block bl;
    } rval;
    if (bot == 0) { /* Avoid undefined behavior: shifting a 64-bit value by 64 */
        rval.u64[0] = KtopStr[0];
        rval.u64[1] = KtopStr[1];
    } else {
        rval.u64[0] = (KtopStr[0] << bot) | (KtopStr[1] >> (64 - bot));
        rval.u64[1] = (KtopStr[1] << bot) | (KtopStr[2] >> (64 - bot));
    }
    return rval.bl;
}
| #else |
| /* Special handling: Shifts are mod 32, and no 64-bit types */ |
| block gen_offset(uint64_t KtopStr[3], unsigned bot) { |
| const vector unsigned k32 = {32, 32, 32, 32}; |
| vector unsigned hi = *(vector unsigned*)(KtopStr + 0); |
| vector unsigned lo = *(vector unsigned*)(KtopStr + 2); |
| vector unsigned bot_vec; |
| if (bot < 32) { |
| lo = vec_sld(hi, lo, 4); |
| } else { |
| vector unsigned t = vec_sld(hi, lo, 4); |
| lo = vec_sld(hi, lo, 8); |
| hi = t; |
| bot = bot - 32; |
| } |
| if (bot == 0) |
| return hi; |
| *(unsigned*)&bot_vec = bot; |
| vector unsigned lshift = vec_splat(bot_vec, 0); |
| vector unsigned rshift = vec_sub(k32, lshift); |
| hi = vec_sl(hi, lshift); |
| lo = vec_sr(lo, rshift); |
| return vec_xor(hi, lo); |
| } |
| #endif |
| static inline block double_block(block b) { |
| const vector unsigned char mask = {135, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; |
| const vector unsigned char perm = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0}; |
| const vector unsigned char shift7 = vec_splat_u8(7); |
| const vector unsigned char shift1 = vec_splat_u8(1); |
| vector unsigned char c = (vector unsigned char)b; |
| vector unsigned char t = vec_sra(c, shift7); |
| t = vec_and(t, mask); |
| t = vec_perm(t, t, perm); |
| c = vec_sl(c, shift1); |
| return (block)vec_xor(c, t); |
| } |
| #elif __ARM_NEON__ |
| #include <arm_neon.h> |
| typedef int8x16_t block; /* Yay! Endian-neutral reads! */ |
| #define xor_block(x, y) veorq_s8(x, y) |
| #define zero_block() vdupq_n_s8(0) |
| static inline int unequal_blocks(block a, block b) { |
| int64x2_t t = veorq_s64((int64x2_t)a, (int64x2_t)b); |
| return (vgetq_lane_s64(t, 0) | vgetq_lane_s64(t, 1)) != 0; |
| } |
| #define swap_if_le(b) (b) /* Using endian-neutral int8x16_t */ |
/* KtopStr is register correct in 64-bit chunks; the return value is memory correct */
| block gen_offset(uint64_t KtopStr[3], unsigned bot) { |
| const union { |
| unsigned x; |
| unsigned char endian; |
| } little = {1}; |
| const int64x2_t k64 = {-64, -64}; |
| uint64x2_t hi = *(uint64x2_t*)(KtopStr + 0); /* hi = A B */ |
    uint64x2_t lo = *(uint64x2_t*)(KtopStr + 1);     /* lo = B C */
| int64x2_t ls = vdupq_n_s64(bot); |
| int64x2_t rs = vqaddq_s64(k64, ls); |
| block rval = (block)veorq_u64(vshlq_u64(hi, ls), vshlq_u64(lo, rs)); |
| if (little.endian) |
| rval = vrev64q_s8(rval); |
| return rval; |
| } |
| static inline block double_block(block b) { |
| const block mask = {135, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; |
| block tmp = vshrq_n_s8(b, 7); |
| tmp = vandq_s8(tmp, mask); |
| tmp = vextq_s8(tmp, tmp, 1); /* Rotate high byte to end */ |
| b = vshlq_n_s8(b, 1); |
| return veorq_s8(tmp, b); |
| } |
| #else |
| typedef struct { uint64_t l, r; } block; |
| static inline block xor_block(block x, block y) { |
| x.l ^= y.l; |
| x.r ^= y.r; |
| return x; |
| } |
| static inline block zero_block(void) { |
| const block t = {0, 0}; |
| return t; |
| } |
| #define unequal_blocks(x, y) ((((x).l ^ (y).l) | ((x).r ^ (y).r)) != 0) |
| static inline block swap_if_le(block b) { |
| const union { |
| unsigned x; |
| unsigned char endian; |
| } little = {1}; |
| if (little.endian) { |
| block r; |
| r.l = bswap64(b.l); |
| r.r = bswap64(b.r); |
| return r; |
| } else |
| return b; |
| } |
| |
/* KtopStr is register correct in 64-bit chunks; the return value is memory correct */
| block gen_offset(uint64_t KtopStr[3], unsigned bot) { |
| block rval; |
| if (bot != 0) { |
| rval.l = (KtopStr[0] << bot) | (KtopStr[1] >> (64 - bot)); |
| rval.r = (KtopStr[1] << bot) | (KtopStr[2] >> (64 - bot)); |
| } else { |
| rval.l = KtopStr[0]; |
| rval.r = KtopStr[1]; |
| } |
| return swap_if_le(rval); |
| } |
| |
| #if __GNUC__ && __arm__ |
| static inline block double_block(block b) { |
| __asm__("adds %1,%1,%1\n\t" |
| "adcs %H1,%H1,%H1\n\t" |
| "adcs %0,%0,%0\n\t" |
| "adcs %H0,%H0,%H0\n\t" |
| "it cs\n\t" |
| "eorcs %1,%1,#135" |
| : "+r"(b.l), "+r"(b.r) |
| : |
| : "cc"); |
| return b; |
| } |
| #else |
| static inline block double_block(block b) { |
| uint64_t t = (uint64_t)((int64_t)b.l >> 63); |
| b.l = (b.l + b.l) ^ (b.r >> 63); |
| b.r = (b.r + b.r) ^ (t & 135); |
| return b; |
| } |
| #endif |
| |
| #endif |
| |
| /* ----------------------------------------------------------------------- */ |
| /* AES - Code uses OpenSSL API. Other implementations get mapped to it. */ |
| /* ----------------------------------------------------------------------- */ |
| |
| /*---------------*/ |
| #if USE_OPENSSL_AES |
| /*---------------*/ |
| |
| #include <openssl/aes.h> /* http://openssl.org/ */ |
| |
| /* How to ECB encrypt an array of blocks, in place */ |
| static inline void AES_ecb_encrypt_blks(block* blks, unsigned nblks, AES_KEY* key) { |
| while (nblks) { |
| --nblks; |
| AES_encrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key); |
| } |
| } |
| |
| static inline void AES_ecb_decrypt_blks(block* blks, unsigned nblks, AES_KEY* key) { |
| while (nblks) { |
| --nblks; |
| AES_decrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key); |
| } |
| } |
| |
| #define BPI 4 /* Number of blocks in buffer per ECB call */ |
| |
| /*-------------------*/ |
| #elif USE_REFERENCE_AES |
| /*-------------------*/ |
| |
| #include "rijndael-alg-fst.h" /* Barreto's Public-Domain Code */ |
| #if (OCB_KEY_LEN == 0) |
| typedef struct { |
| uint32_t rd_key[60]; |
| int rounds; |
| } AES_KEY; |
| #define ROUNDS(ctx) ((ctx)->rounds) |
| #define AES_set_encrypt_key(x, y, z) \ |
| do { \ |
| rijndaelKeySetupEnc((z)->rd_key, x, y); \ |
| (z)->rounds = y / 32 + 6; \ |
| } while (0) |
| #define AES_set_decrypt_key(x, y, z) \ |
| do { \ |
| rijndaelKeySetupDec((z)->rd_key, x, y); \ |
| (z)->rounds = y / 32 + 6; \ |
| } while (0) |
| #else |
| typedef struct { uint32_t rd_key[OCB_KEY_LEN + 28]; } AES_KEY; |
| #define ROUNDS(ctx) (6 + OCB_KEY_LEN / 4) |
| #define AES_set_encrypt_key(x, y, z) rijndaelKeySetupEnc((z)->rd_key, x, y) |
| #define AES_set_decrypt_key(x, y, z) rijndaelKeySetupDec((z)->rd_key, x, y) |
| #endif |
| #define AES_encrypt(x, y, z) rijndaelEncrypt((z)->rd_key, ROUNDS(z), x, y) |
| #define AES_decrypt(x, y, z) rijndaelDecrypt((z)->rd_key, ROUNDS(z), x, y) |
| |
| static void AES_ecb_encrypt_blks(block* blks, unsigned nblks, AES_KEY* key) { |
| while (nblks) { |
| --nblks; |
| AES_encrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key); |
| } |
| } |
| |
static void AES_ecb_decrypt_blks(block* blks, unsigned nblks, AES_KEY* key) {
| while (nblks) { |
| --nblks; |
| AES_decrypt((unsigned char*)(blks + nblks), (unsigned char*)(blks + nblks), key); |
| } |
| } |
| |
| #define BPI 4 /* Number of blocks in buffer per ECB call */ |
| |
| /*----------*/ |
| #elif USE_AES_NI |
| /*----------*/ |
| |
| #include <wmmintrin.h> |
| |
| #if (OCB_KEY_LEN == 0) |
| typedef struct { |
| __m128i rd_key[15]; |
| int rounds; |
| } AES_KEY; |
| #define ROUNDS(ctx) ((ctx)->rounds) |
| #else |
| typedef struct { __m128i rd_key[7 + OCB_KEY_LEN / 4]; } AES_KEY; |
| #define ROUNDS(ctx) (6 + OCB_KEY_LEN / 4) |
| #endif |
| |
| #define EXPAND_ASSIST(v1, v2, v3, v4, shuff_const, aes_const) \ |
| v2 = _mm_aeskeygenassist_si128(v4, aes_const); \ |
| v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3), _mm_castsi128_ps(v1), 16)); \ |
| v1 = _mm_xor_si128(v1, v3); \ |
| v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3), _mm_castsi128_ps(v1), 140)); \ |
| v1 = _mm_xor_si128(v1, v3); \ |
| v2 = _mm_shuffle_epi32(v2, shuff_const); \ |
| v1 = _mm_xor_si128(v1, v2) |
| |
| #define EXPAND192_STEP(idx, aes_const) \ |
| EXPAND_ASSIST(x0, x1, x2, x3, 85, aes_const); \ |
| x3 = _mm_xor_si128(x3, _mm_slli_si128(x3, 4)); \ |
| x3 = _mm_xor_si128(x3, _mm_shuffle_epi32(x0, 255)); \ |
| kp[idx] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(x0), 68)); \ |
| kp[idx + 1] = \ |
| _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x3), 78)); \ |
| EXPAND_ASSIST(x0, x1, x2, x3, 85, (aes_const * 2)); \ |
| x3 = _mm_xor_si128(x3, _mm_slli_si128(x3, 4)); \ |
| x3 = _mm_xor_si128(x3, _mm_shuffle_epi32(x0, 255)); \ |
| kp[idx + 2] = x0; \ |
| tmp = x3 |
| |
| static void AES_128_Key_Expansion(const unsigned char* userkey, void* key) { |
| __m128i x0, x1, x2; |
| __m128i* kp = (__m128i*)key; |
| kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey); |
| x2 = _mm_setzero_si128(); |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 1); |
| kp[1] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 2); |
| kp[2] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 4); |
| kp[3] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 8); |
| kp[4] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 16); |
| kp[5] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 32); |
| kp[6] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 64); |
| kp[7] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 128); |
| kp[8] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 27); |
| kp[9] = x0; |
| EXPAND_ASSIST(x0, x1, x2, x0, 255, 54); |
| kp[10] = x0; |
| } |
| |
| static void AES_192_Key_Expansion(const unsigned char* userkey, void* key) { |
| __m128i x0, x1, x2, x3, tmp, *kp = (__m128i*)key; |
| kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey); |
| tmp = x3 = _mm_loadu_si128((__m128i*)(userkey + 16)); |
| x2 = _mm_setzero_si128(); |
| EXPAND192_STEP(1, 1); |
| EXPAND192_STEP(4, 4); |
| EXPAND192_STEP(7, 16); |
| EXPAND192_STEP(10, 64); |
| } |
| |
| static void AES_256_Key_Expansion(const unsigned char* userkey, void* key) { |
| __m128i x0, x1, x2, x3, *kp = (__m128i*)key; |
| kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey); |
| kp[1] = x3 = _mm_loadu_si128((__m128i*)(userkey + 16)); |
| x2 = _mm_setzero_si128(); |
| EXPAND_ASSIST(x0, x1, x2, x3, 255, 1); |
| kp[2] = x0; |
| EXPAND_ASSIST(x3, x1, x2, x0, 170, 1); |
| kp[3] = x3; |
| EXPAND_ASSIST(x0, x1, x2, x3, 255, 2); |
| kp[4] = x0; |
| EXPAND_ASSIST(x3, x1, x2, x0, 170, 2); |
| kp[5] = x3; |
| EXPAND_ASSIST(x0, x1, x2, x3, 255, 4); |
| kp[6] = x0; |
| EXPAND_ASSIST(x3, x1, x2, x0, 170, 4); |
| kp[7] = x3; |
| EXPAND_ASSIST(x0, x1, x2, x3, 255, 8); |
| kp[8] = x0; |
| EXPAND_ASSIST(x3, x1, x2, x0, 170, 8); |
| kp[9] = x3; |
| EXPAND_ASSIST(x0, x1, x2, x3, 255, 16); |
| kp[10] = x0; |
| EXPAND_ASSIST(x3, x1, x2, x0, 170, 16); |
| kp[11] = x3; |
| EXPAND_ASSIST(x0, x1, x2, x3, 255, 32); |
| kp[12] = x0; |
| EXPAND_ASSIST(x3, x1, x2, x0, 170, 32); |
| kp[13] = x3; |
| EXPAND_ASSIST(x0, x1, x2, x3, 255, 64); |
| kp[14] = x0; |
| } |
| |
| static int AES_set_encrypt_key(const unsigned char* userKey, const int bits, AES_KEY* key) { |
| if (bits == 128) { |
| AES_128_Key_Expansion(userKey, key); |
| } else if (bits == 192) { |
| AES_192_Key_Expansion(userKey, key); |
| } else if (bits == 256) { |
| AES_256_Key_Expansion(userKey, key); |
| } |
| #if (OCB_KEY_LEN == 0) |
| key->rounds = 6 + bits / 32; |
| #endif |
| return 0; |
| } |
| |
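/* Derive decryption round keys from encryption round keys: AES's
   "equivalent inverse cipher" reverses the key schedule and applies
   InvMixColumns (aesimc) to each interior round key. */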
| static void AES_set_decrypt_key_fast(AES_KEY* dkey, const AES_KEY* ekey) { |
| int j = 0; |
| int i = ROUNDS(ekey); |
| #if (OCB_KEY_LEN == 0) |
| dkey->rounds = i; |
| #endif |
| dkey->rd_key[i--] = ekey->rd_key[j++]; |
| while (i) |
| dkey->rd_key[i--] = _mm_aesimc_si128(ekey->rd_key[j++]); |
| dkey->rd_key[i] = ekey->rd_key[j]; |
| } |
| |
| static int AES_set_decrypt_key(const unsigned char* userKey, const int bits, AES_KEY* key) { |
| AES_KEY temp_key; |
| AES_set_encrypt_key(userKey, bits, &temp_key); |
| AES_set_decrypt_key_fast(key, &temp_key); |
| return 0; |
| } |
| |
| static inline void AES_encrypt(const unsigned char* in, unsigned char* out, const AES_KEY* key) { |
| int j, rnds = ROUNDS(key); |
| const __m128i* sched = ((__m128i*)(key->rd_key)); |
| __m128i tmp = _mm_load_si128((__m128i*)in); |
| tmp = _mm_xor_si128(tmp, sched[0]); |
| for (j = 1; j < rnds; j++) |
| tmp = _mm_aesenc_si128(tmp, sched[j]); |
| tmp = _mm_aesenclast_si128(tmp, sched[j]); |
| _mm_store_si128((__m128i*)out, tmp); |
| } |
| |
| static inline void AES_decrypt(const unsigned char* in, unsigned char* out, const AES_KEY* key) { |
| int j, rnds = ROUNDS(key); |
| const __m128i* sched = ((__m128i*)(key->rd_key)); |
| __m128i tmp = _mm_load_si128((__m128i*)in); |
| tmp = _mm_xor_si128(tmp, sched[0]); |
| for (j = 1; j < rnds; j++) |
| tmp = _mm_aesdec_si128(tmp, sched[j]); |
| tmp = _mm_aesdeclast_si128(tmp, sched[j]); |
| _mm_store_si128((__m128i*)out, tmp); |
| } |
| |
| static inline void AES_ecb_encrypt_blks(block* blks, unsigned nblks, AES_KEY* key) { |
| unsigned i, j, rnds = ROUNDS(key); |
| const __m128i* sched = ((__m128i*)(key->rd_key)); |
| for (i = 0; i < nblks; ++i) |
| blks[i] = _mm_xor_si128(blks[i], sched[0]); |
| for (j = 1; j < rnds; ++j) |
| for (i = 0; i < nblks; ++i) |
| blks[i] = _mm_aesenc_si128(blks[i], sched[j]); |
| for (i = 0; i < nblks; ++i) |
| blks[i] = _mm_aesenclast_si128(blks[i], sched[j]); |
| } |
| |
| static inline void AES_ecb_decrypt_blks(block* blks, unsigned nblks, AES_KEY* key) { |
| unsigned i, j, rnds = ROUNDS(key); |
| const __m128i* sched = ((__m128i*)(key->rd_key)); |
| for (i = 0; i < nblks; ++i) |
| blks[i] = _mm_xor_si128(blks[i], sched[0]); |
| for (j = 1; j < rnds; ++j) |
| for (i = 0; i < nblks; ++i) |
| blks[i] = _mm_aesdec_si128(blks[i], sched[j]); |
| for (i = 0; i < nblks; ++i) |
| blks[i] = _mm_aesdeclast_si128(blks[i], sched[j]); |
| } |
| |
| #define BPI 8 /* Number of blocks in buffer per ECB call */ |
| /* Set to 4 for Westmere, 8 for Sandy Bridge */ |
| |
| #endif |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Define OCB context structure. */ |
| /* ----------------------------------------------------------------------- */ |
| |
| /*------------------------------------------------------------------------ |
| / Each item in the OCB context is stored either "memory correct" or |
| / "register correct". On big-endian machines, this is identical. On |
| / little-endian machines, one must choose whether the byte-string |
| / is in the correct order when it resides in memory or in registers. |
| / It must be register correct whenever it is to be manipulated |
| / arithmetically, but must be memory correct whenever it interacts |
| / with the plaintext or ciphertext. |
| /------------------------------------------------------------------------- */ |
| |
| struct _ae_ctx { |
| block offset; /* Memory correct */ |
| block checksum; /* Memory correct */ |
| block Lstar; /* Memory correct */ |
| block Ldollar; /* Memory correct */ |
| block L[L_TABLE_SZ]; /* Memory correct */ |
| block ad_checksum; /* Memory correct */ |
| block ad_offset; /* Memory correct */ |
| block cached_Top; /* Memory correct */ |
| uint64_t KtopStr[3]; /* Register correct, each item */ |
| uint32_t ad_blocks_processed; |
| uint32_t blocks_processed; |
| AES_KEY decrypt_key; |
| AES_KEY encrypt_key; |
| #if (OCB_TAG_LEN == 0) |
| unsigned tag_len; |
| #endif |
| }; |
| |
| /* ----------------------------------------------------------------------- */ |
| /* L table lookup (or on-the-fly generation) */ |
| /* ----------------------------------------------------------------------- */ |
| |
| #if L_TABLE_SZ_IS_ENOUGH |
| #define getL(_ctx, _tz) ((_ctx)->L[_tz]) |
| #else |
| static block getL(const ae_ctx* ctx, unsigned tz) { |
| if (tz < L_TABLE_SZ) |
| return ctx->L[tz]; |
| else { |
| unsigned i; |
| /* Bring L[MAX] into registers, make it register correct */ |
| block rval = swap_if_le(ctx->L[L_TABLE_SZ - 1]); |
| rval = double_block(rval); |
| for (i = L_TABLE_SZ; i < tz; i++) |
| rval = double_block(rval); |
| return swap_if_le(rval); /* To memory correct */ |
| } |
| } |
| #endif |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Public functions */ |
| /* ----------------------------------------------------------------------- */ |
| |
/* 32-bit SSE2 and AltiVec systems need to be forced to allocate memory
   on 16-byte boundaries. (I believe all major 64-bit systems do already.) */
| |
| ae_ctx* ae_allocate(void* misc) { |
| void* p; |
| (void)misc; /* misc unused in this implementation */ |
| #if (__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__) |
| p = _mm_malloc(sizeof(ae_ctx), 16); |
#elif (__ALTIVEC__ && !__PPC64__)
| if (posix_memalign(&p, 16, sizeof(ae_ctx)) != 0) |
| p = NULL; |
| #else |
| p = malloc(sizeof(ae_ctx)); |
| #endif |
| return (ae_ctx*)p; |
| } |
| |
| void ae_free(ae_ctx* ctx) { |
| #if (__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__) |
| _mm_free(ctx); |
| #else |
| free(ctx); |
| #endif |
| } |
| |
| /* ----------------------------------------------------------------------- */ |
| |
| int ae_clear(ae_ctx* ctx) /* Zero ae_ctx and undo initialization */ |
| { |
| memset(ctx, 0, sizeof(ae_ctx)); |
| return AE_SUCCESS; |
| } |
| |
| int ae_ctx_sizeof(void) { |
| return (int)sizeof(ae_ctx); |
| } |
| |
| /* ----------------------------------------------------------------------- */ |
| |
| int ae_init(ae_ctx* ctx, const void* key, int key_len, int nonce_len, int tag_len) { |
| unsigned i; |
| block tmp_blk; |
| |
| if (nonce_len != 12) |
| return AE_NOT_SUPPORTED; |
| |
| /* Initialize encryption & decryption keys */ |
| #if (OCB_KEY_LEN > 0) |
| key_len = OCB_KEY_LEN; |
| #endif |
| AES_set_encrypt_key((unsigned char*)key, key_len * 8, &ctx->encrypt_key); |
| #if USE_AES_NI |
| AES_set_decrypt_key_fast(&ctx->decrypt_key, &ctx->encrypt_key); |
| #else |
| AES_set_decrypt_key((unsigned char*)key, (int)(key_len * 8), &ctx->decrypt_key); |
| #endif |
| |
| /* Zero things that need zeroing */ |
| ctx->cached_Top = ctx->ad_checksum = zero_block(); |
| ctx->ad_blocks_processed = 0; |
| |
| /* Compute key-dependent values */ |
| AES_encrypt((unsigned char*)&ctx->cached_Top, (unsigned char*)&ctx->Lstar, &ctx->encrypt_key); |
| tmp_blk = swap_if_le(ctx->Lstar); |
| tmp_blk = double_block(tmp_blk); |
| ctx->Ldollar = swap_if_le(tmp_blk); |
| tmp_blk = double_block(tmp_blk); |
| ctx->L[0] = swap_if_le(tmp_blk); |
| for (i = 1; i < L_TABLE_SZ; i++) { |
| tmp_blk = double_block(tmp_blk); |
| ctx->L[i] = swap_if_le(tmp_blk); |
| } |
| |
| #if (OCB_TAG_LEN == 0) |
| ctx->tag_len = tag_len; |
| #else |
    (void)tag_len; /* Suppress unused-variable warning */
| #endif |
| |
| return AE_SUCCESS; |
| } |
| |
| /* ----------------------------------------------------------------------- */ |
| |
| static block gen_offset_from_nonce(ae_ctx* ctx, const void* nonce) { |
| const union { |
| unsigned x; |
| unsigned char endian; |
| } little = {1}; |
| union { |
| uint32_t u32[4]; |
| uint8_t u8[16]; |
| block bl; |
| } tmp; |
| unsigned idx; |
| uint32_t tagadd; |
| |
| /* Replace cached nonce Top if needed */ |
| #if (OCB_TAG_LEN > 0) |
| if (little.endian) |
| tmp.u32[0] = 0x01000000 + ((OCB_TAG_LEN * 8 % 128) << 1); |
| else |
| tmp.u32[0] = 0x00000001 + ((OCB_TAG_LEN * 8 % 128) << 25); |
| #else |
| if (little.endian) |
| tmp.u32[0] = 0x01000000 + ((ctx->tag_len * 8 % 128) << 1); |
| else |
| tmp.u32[0] = 0x00000001 + ((ctx->tag_len * 8 % 128) << 25); |
| #endif |
| tmp.u32[1] = ((uint32_t*)nonce)[0]; |
| tmp.u32[2] = ((uint32_t*)nonce)[1]; |
| tmp.u32[3] = ((uint32_t*)nonce)[2]; |
| idx = (unsigned)(tmp.u8[15] & 0x3f); /* Get low 6 bits of nonce */ |
| tmp.u8[15] = tmp.u8[15] & 0xc0; /* Zero low 6 bits of nonce */ |
    if (unequal_blocks(tmp.bl, ctx->cached_Top)) { /* Cache miss? */
        ctx->cached_Top = tmp.bl;                  /* Update cache; recompute KtopStr */
| AES_encrypt(tmp.u8, (unsigned char*)&ctx->KtopStr, &ctx->encrypt_key); |
| if (little.endian) { /* Make Register Correct */ |
| ctx->KtopStr[0] = bswap64(ctx->KtopStr[0]); |
| ctx->KtopStr[1] = bswap64(ctx->KtopStr[1]); |
| } |
| ctx->KtopStr[2] = ctx->KtopStr[0] ^ (ctx->KtopStr[0] << 8) ^ (ctx->KtopStr[1] >> 56); |
| } |
| return gen_offset(ctx->KtopStr, idx); |
| } |
| |
| static void process_ad(ae_ctx* ctx, const void* ad, int ad_len, int final) { |
| union { |
| uint32_t u32[4]; |
| uint8_t u8[16]; |
| block bl; |
| } tmp; |
| block ad_offset, ad_checksum; |
| const block* adp = (block*)ad; |
| unsigned i, k, tz, remaining; |
| |
| ad_offset = ctx->ad_offset; |
| ad_checksum = ctx->ad_checksum; |
| i = ad_len / (BPI * 16); |
| if (i) { |
| unsigned ad_block_num = ctx->ad_blocks_processed; |
| do { |
| block ta[BPI], oa[BPI]; |
| ad_block_num += BPI; |
| tz = ntz(ad_block_num); |
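            /* Offset_i = Offset_{i-1} xor L[ntz(i)], and the AD hash is
               Sum ^= E_K(A_i xor Offset_i). Within a group of BPI blocks
               only the last offset needs the tz lookup. */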
| oa[0] = xor_block(ad_offset, ctx->L[0]); |
| ta[0] = xor_block(oa[0], adp[0]); |
| oa[1] = xor_block(oa[0], ctx->L[1]); |
| ta[1] = xor_block(oa[1], adp[1]); |
| oa[2] = xor_block(ad_offset, ctx->L[1]); |
| ta[2] = xor_block(oa[2], adp[2]); |
| #if BPI == 4 |
| ad_offset = xor_block(oa[2], getL(ctx, tz)); |
| ta[3] = xor_block(ad_offset, adp[3]); |
| #elif BPI == 8 |
| oa[3] = xor_block(oa[2], ctx->L[2]); |
| ta[3] = xor_block(oa[3], adp[3]); |
| oa[4] = xor_block(oa[1], ctx->L[2]); |
| ta[4] = xor_block(oa[4], adp[4]); |
| oa[5] = xor_block(oa[0], ctx->L[2]); |
| ta[5] = xor_block(oa[5], adp[5]); |
| oa[6] = xor_block(ad_offset, ctx->L[2]); |
| ta[6] = xor_block(oa[6], adp[6]); |
| ad_offset = xor_block(oa[6], getL(ctx, tz)); |
| ta[7] = xor_block(ad_offset, adp[7]); |
| #endif |
| AES_ecb_encrypt_blks(ta, BPI, &ctx->encrypt_key); |
| ad_checksum = xor_block(ad_checksum, ta[0]); |
| ad_checksum = xor_block(ad_checksum, ta[1]); |
| ad_checksum = xor_block(ad_checksum, ta[2]); |
| ad_checksum = xor_block(ad_checksum, ta[3]); |
| #if (BPI == 8) |
| ad_checksum = xor_block(ad_checksum, ta[4]); |
| ad_checksum = xor_block(ad_checksum, ta[5]); |
| ad_checksum = xor_block(ad_checksum, ta[6]); |
| ad_checksum = xor_block(ad_checksum, ta[7]); |
| #endif |
| adp += BPI; |
| } while (--i); |
| ctx->ad_blocks_processed = ad_block_num; |
| ctx->ad_offset = ad_offset; |
| ctx->ad_checksum = ad_checksum; |
| } |
| |
| if (final) { |
| block ta[BPI]; |
| |
| /* Process remaining associated data, compute its tag contribution */ |
| remaining = ((unsigned)ad_len) % (BPI * 16); |
| if (remaining) { |
| k = 0; |
| #if (BPI == 8) |
| if (remaining >= 64) { |
| tmp.bl = xor_block(ad_offset, ctx->L[0]); |
| ta[0] = xor_block(tmp.bl, adp[0]); |
| tmp.bl = xor_block(tmp.bl, ctx->L[1]); |
| ta[1] = xor_block(tmp.bl, adp[1]); |
| ad_offset = xor_block(ad_offset, ctx->L[1]); |
| ta[2] = xor_block(ad_offset, adp[2]); |
| ad_offset = xor_block(ad_offset, ctx->L[2]); |
| ta[3] = xor_block(ad_offset, adp[3]); |
| remaining -= 64; |
| k = 4; |
| } |
| #endif |
| if (remaining >= 32) { |
| ad_offset = xor_block(ad_offset, ctx->L[0]); |
| ta[k] = xor_block(ad_offset, adp[k]); |
| ad_offset = xor_block(ad_offset, getL(ctx, ntz(k + 2))); |
| ta[k + 1] = xor_block(ad_offset, adp[k + 1]); |
| remaining -= 32; |
| k += 2; |
| } |
| if (remaining >= 16) { |
| ad_offset = xor_block(ad_offset, ctx->L[0]); |
| ta[k] = xor_block(ad_offset, adp[k]); |
| remaining = remaining - 16; |
| ++k; |
| } |
| if (remaining) { |
| ad_offset = xor_block(ad_offset, ctx->Lstar); |
| tmp.bl = zero_block(); |
| memcpy(tmp.u8, adp + k, remaining); |
| tmp.u8[remaining] = (unsigned char)0x80u; |
| ta[k] = xor_block(ad_offset, tmp.bl); |
| ++k; |
| } |
| AES_ecb_encrypt_blks(ta, k, &ctx->encrypt_key); |
| switch (k) { |
| #if (BPI == 8) |
| case 8: |
| ad_checksum = xor_block(ad_checksum, ta[7]); |
| case 7: |
| ad_checksum = xor_block(ad_checksum, ta[6]); |
| case 6: |
| ad_checksum = xor_block(ad_checksum, ta[5]); |
| case 5: |
| ad_checksum = xor_block(ad_checksum, ta[4]); |
| #endif |
| case 4: |
| ad_checksum = xor_block(ad_checksum, ta[3]); |
| case 3: |
| ad_checksum = xor_block(ad_checksum, ta[2]); |
| case 2: |
| ad_checksum = xor_block(ad_checksum, ta[1]); |
| case 1: |
| ad_checksum = xor_block(ad_checksum, ta[0]); |
| } |
| ctx->ad_checksum = ad_checksum; |
| } |
| } |
| } |
| |
| /* ----------------------------------------------------------------------- */ |
| |
| int ae_encrypt(ae_ctx* ctx, const void* nonce, const void* pt, int pt_len, const void* ad, |
| int ad_len, void* ct, void* tag, int final) { |
| union { |
| uint32_t u32[4]; |
| uint8_t u8[16]; |
| block bl; |
| } tmp; |
| block offset, checksum; |
| unsigned i, k; |
| block* ctp = (block*)ct; |
| const block* ptp = (block*)pt; |
| |
| /* Non-null nonce means start of new message, init per-message values */ |
| if (nonce) { |
| ctx->offset = gen_offset_from_nonce(ctx, nonce); |
| ctx->ad_offset = ctx->checksum = zero_block(); |
| ctx->ad_blocks_processed = ctx->blocks_processed = 0; |
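        /* A negative ad_len reuses the previous message's AD checksum */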
| if (ad_len >= 0) |
| ctx->ad_checksum = zero_block(); |
| } |
| |
| /* Process associated data */ |
| if (ad_len > 0) |
| process_ad(ctx, ad, ad_len, final); |
| |
| /* Encrypt plaintext data BPI blocks at a time */ |
| offset = ctx->offset; |
| checksum = ctx->checksum; |
| i = pt_len / (BPI * 16); |
| if (i) { |
| block oa[BPI]; |
| unsigned block_num = ctx->blocks_processed; |
| oa[BPI - 1] = offset; |
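        /* Offset_i = Offset_{i-1} xor L[ntz(i)]; within each group of BPI
           blocks the xor sequence is fixed (L[0], L[1], L[0], ...), so only
           the group's last block needs the getL()/ntz() lookup. */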
| do { |
| block ta[BPI]; |
| block_num += BPI; |
| oa[0] = xor_block(oa[BPI - 1], ctx->L[0]); |
| ta[0] = xor_block(oa[0], ptp[0]); |
| checksum = xor_block(checksum, ptp[0]); |
| oa[1] = xor_block(oa[0], ctx->L[1]); |
| ta[1] = xor_block(oa[1], ptp[1]); |
| checksum = xor_block(checksum, ptp[1]); |
| oa[2] = xor_block(oa[1], ctx->L[0]); |
| ta[2] = xor_block(oa[2], ptp[2]); |
| checksum = xor_block(checksum, ptp[2]); |
| #if BPI == 4 |
| oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num))); |
| ta[3] = xor_block(oa[3], ptp[3]); |
| checksum = xor_block(checksum, ptp[3]); |
| #elif BPI == 8 |
| oa[3] = xor_block(oa[2], ctx->L[2]); |
| ta[3] = xor_block(oa[3], ptp[3]); |
| checksum = xor_block(checksum, ptp[3]); |
| oa[4] = xor_block(oa[1], ctx->L[2]); |
| ta[4] = xor_block(oa[4], ptp[4]); |
| checksum = xor_block(checksum, ptp[4]); |
| oa[5] = xor_block(oa[0], ctx->L[2]); |
| ta[5] = xor_block(oa[5], ptp[5]); |
| checksum = xor_block(checksum, ptp[5]); |
| oa[6] = xor_block(oa[7], ctx->L[2]); |
| ta[6] = xor_block(oa[6], ptp[6]); |
| checksum = xor_block(checksum, ptp[6]); |
| oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num))); |
| ta[7] = xor_block(oa[7], ptp[7]); |
| checksum = xor_block(checksum, ptp[7]); |
| #endif |
| AES_ecb_encrypt_blks(ta, BPI, &ctx->encrypt_key); |
| ctp[0] = xor_block(ta[0], oa[0]); |
| ctp[1] = xor_block(ta[1], oa[1]); |
| ctp[2] = xor_block(ta[2], oa[2]); |
| ctp[3] = xor_block(ta[3], oa[3]); |
| #if (BPI == 8) |
| ctp[4] = xor_block(ta[4], oa[4]); |
| ctp[5] = xor_block(ta[5], oa[5]); |
| ctp[6] = xor_block(ta[6], oa[6]); |
| ctp[7] = xor_block(ta[7], oa[7]); |
| #endif |
| ptp += BPI; |
| ctp += BPI; |
| } while (--i); |
| ctx->offset = offset = oa[BPI - 1]; |
| ctx->blocks_processed = block_num; |
| ctx->checksum = checksum; |
| } |
| |
| if (final) { |
| block ta[BPI + 1], oa[BPI]; |
| |
| /* Process remaining plaintext and compute its tag contribution */ |
| unsigned remaining = ((unsigned)pt_len) % (BPI * 16); |
| k = 0; /* How many blocks in ta[] need ECBing */ |
| if (remaining) { |
| #if (BPI == 8) |
| if (remaining >= 64) { |
| oa[0] = xor_block(offset, ctx->L[0]); |
| ta[0] = xor_block(oa[0], ptp[0]); |
| checksum = xor_block(checksum, ptp[0]); |
| oa[1] = xor_block(oa[0], ctx->L[1]); |
| ta[1] = xor_block(oa[1], ptp[1]); |
| checksum = xor_block(checksum, ptp[1]); |
| oa[2] = xor_block(oa[1], ctx->L[0]); |
| ta[2] = xor_block(oa[2], ptp[2]); |
| checksum = xor_block(checksum, ptp[2]); |
| offset = oa[3] = xor_block(oa[2], ctx->L[2]); |
| ta[3] = xor_block(offset, ptp[3]); |
| checksum = xor_block(checksum, ptp[3]); |
| remaining -= 64; |
| k = 4; |
| } |
| #endif |
| if (remaining >= 32) { |
| oa[k] = xor_block(offset, ctx->L[0]); |
| ta[k] = xor_block(oa[k], ptp[k]); |
| checksum = xor_block(checksum, ptp[k]); |
| offset = oa[k + 1] = xor_block(oa[k], ctx->L[1]); |
| ta[k + 1] = xor_block(offset, ptp[k + 1]); |
| checksum = xor_block(checksum, ptp[k + 1]); |
| remaining -= 32; |
| k += 2; |
| } |
| if (remaining >= 16) { |
| offset = oa[k] = xor_block(offset, ctx->L[0]); |
| ta[k] = xor_block(offset, ptp[k]); |
| checksum = xor_block(checksum, ptp[k]); |
| remaining -= 16; |
| ++k; |
| } |
| if (remaining) { |
| tmp.bl = zero_block(); |
| memcpy(tmp.u8, ptp + k, remaining); |
| tmp.u8[remaining] = (unsigned char)0x80u; |
| checksum = xor_block(checksum, tmp.bl); |
| ta[k] = offset = xor_block(offset, ctx->Lstar); |
| ++k; |
| } |
| } |
| offset = xor_block(offset, ctx->Ldollar); /* Part of tag gen */ |
| ta[k] = xor_block(offset, checksum); /* Part of tag gen */ |
| AES_ecb_encrypt_blks(ta, k + 1, &ctx->encrypt_key); |
| offset = xor_block(ta[k], ctx->ad_checksum); /* Part of tag gen */ |
| if (remaining) { |
| --k; |
| tmp.bl = xor_block(tmp.bl, ta[k]); |
| memcpy(ctp + k, tmp.u8, remaining); |
| } |
| switch (k) { |
| #if (BPI == 8) |
| case 7: |
| ctp[6] = xor_block(ta[6], oa[6]); |
| case 6: |
| ctp[5] = xor_block(ta[5], oa[5]); |
| case 5: |
| ctp[4] = xor_block(ta[4], oa[4]); |
| case 4: |
| ctp[3] = xor_block(ta[3], oa[3]); |
| #endif |
| case 3: |
| ctp[2] = xor_block(ta[2], oa[2]); |
| case 2: |
| ctp[1] = xor_block(ta[1], oa[1]); |
| case 1: |
| ctp[0] = xor_block(ta[0], oa[0]); |
| } |
| |
        /* Tag is placed at the correct location: the 'tag' buffer if one
           was supplied, otherwise appended to the ciphertext. */
| if (tag) { |
| #if (OCB_TAG_LEN == 16) |
| *(block*)tag = offset; |
#elif (OCB_TAG_LEN > 0)
| memcpy((char*)tag, &offset, OCB_TAG_LEN); |
| #else |
| memcpy((char*)tag, &offset, ctx->tag_len); |
| #endif |
| } else { |
| #if (OCB_TAG_LEN > 0) |
| memcpy((char*)ct + pt_len, &offset, OCB_TAG_LEN); |
| pt_len += OCB_TAG_LEN; |
| #else |
| memcpy((char*)ct + pt_len, &offset, ctx->tag_len); |
| pt_len += ctx->tag_len; |
| #endif |
| } |
| } |
| return (int)pt_len; |
| } |
| |
| /* ----------------------------------------------------------------------- */ |
| |
| /* Compare two regions of memory, taking a constant amount of time for a |
| given buffer size -- under certain assumptions about the compiler |
| and machine, of course. |
| |
| Use this to avoid timing side-channel attacks. |
| |
| Returns 0 for memory regions with equal contents; non-zero otherwise. */ |
| static int constant_time_memcmp(const void* av, const void* bv, size_t n) { |
| const uint8_t* a = (const uint8_t*)av; |
| const uint8_t* b = (const uint8_t*)bv; |
| uint8_t result = 0; |
| size_t i; |
| |
| for (i = 0; i < n; i++) { |
| result |= *a ^ *b; |
| a++; |
| b++; |
| } |
| |
| return (int)result; |
| } |
| |
| int ae_decrypt(ae_ctx* ctx, const void* nonce, const void* ct, int ct_len, const void* ad, |
| int ad_len, void* pt, const void* tag, int final) { |
| union { |
| uint32_t u32[4]; |
| uint8_t u8[16]; |
| block bl; |
| } tmp; |
| block offset, checksum; |
| unsigned i, k; |
| block* ctp = (block*)ct; |
| block* ptp = (block*)pt; |
| |
    /* Reduce ct_len by the tag length when the tag is bundled in ct */
| if ((final) && (!tag)) |
| #if (OCB_TAG_LEN > 0) |
| ct_len -= OCB_TAG_LEN; |
| #else |
| ct_len -= ctx->tag_len; |
| #endif |
| |
| /* Non-null nonce means start of new message, init per-message values */ |
| if (nonce) { |
| ctx->offset = gen_offset_from_nonce(ctx, nonce); |
| ctx->ad_offset = ctx->checksum = zero_block(); |
| ctx->ad_blocks_processed = ctx->blocks_processed = 0; |
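        /* A negative ad_len reuses the previous message's AD checksum */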
| if (ad_len >= 0) |
| ctx->ad_checksum = zero_block(); |
| } |
| |
| /* Process associated data */ |
| if (ad_len > 0) |
| process_ad(ctx, ad, ad_len, final); |
| |
    /* Decrypt ciphertext data BPI blocks at a time */
| offset = ctx->offset; |
| checksum = ctx->checksum; |
| i = ct_len / (BPI * 16); |
| if (i) { |
| block oa[BPI]; |
| unsigned block_num = ctx->blocks_processed; |
| oa[BPI - 1] = offset; |
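        /* Same offset schedule as encryption; each plaintext block is
           recovered as P_i = D_K(C_i xor Offset_i) xor Offset_i and summed
           into the checksum. */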
| do { |
| block ta[BPI]; |
| block_num += BPI; |
| oa[0] = xor_block(oa[BPI - 1], ctx->L[0]); |
| ta[0] = xor_block(oa[0], ctp[0]); |
| oa[1] = xor_block(oa[0], ctx->L[1]); |
| ta[1] = xor_block(oa[1], ctp[1]); |
| oa[2] = xor_block(oa[1], ctx->L[0]); |
| ta[2] = xor_block(oa[2], ctp[2]); |
| #if BPI == 4 |
| oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num))); |
| ta[3] = xor_block(oa[3], ctp[3]); |
| #elif BPI == 8 |
| oa[3] = xor_block(oa[2], ctx->L[2]); |
| ta[3] = xor_block(oa[3], ctp[3]); |
| oa[4] = xor_block(oa[1], ctx->L[2]); |
| ta[4] = xor_block(oa[4], ctp[4]); |
| oa[5] = xor_block(oa[0], ctx->L[2]); |
| ta[5] = xor_block(oa[5], ctp[5]); |
| oa[6] = xor_block(oa[7], ctx->L[2]); |
| ta[6] = xor_block(oa[6], ctp[6]); |
| oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num))); |
| ta[7] = xor_block(oa[7], ctp[7]); |
| #endif |
| AES_ecb_decrypt_blks(ta, BPI, &ctx->decrypt_key); |
| ptp[0] = xor_block(ta[0], oa[0]); |
| checksum = xor_block(checksum, ptp[0]); |
| ptp[1] = xor_block(ta[1], oa[1]); |
| checksum = xor_block(checksum, ptp[1]); |
| ptp[2] = xor_block(ta[2], oa[2]); |
| checksum = xor_block(checksum, ptp[2]); |
| ptp[3] = xor_block(ta[3], oa[3]); |
| checksum = xor_block(checksum, ptp[3]); |
| #if (BPI == 8) |
| ptp[4] = xor_block(ta[4], oa[4]); |
| checksum = xor_block(checksum, ptp[4]); |
| ptp[5] = xor_block(ta[5], oa[5]); |
| checksum = xor_block(checksum, ptp[5]); |
| ptp[6] = xor_block(ta[6], oa[6]); |
| checksum = xor_block(checksum, ptp[6]); |
| ptp[7] = xor_block(ta[7], oa[7]); |
| checksum = xor_block(checksum, ptp[7]); |
| #endif |
| ptp += BPI; |
| ctp += BPI; |
| } while (--i); |
| ctx->offset = offset = oa[BPI - 1]; |
| ctx->blocks_processed = block_num; |
| ctx->checksum = checksum; |
| } |
| |
| if (final) { |
| block ta[BPI + 1], oa[BPI]; |
| |
        /* Process remaining ciphertext and compute its tag contribution */
| unsigned remaining = ((unsigned)ct_len) % (BPI * 16); |
| k = 0; /* How many blocks in ta[] need ECBing */ |
| if (remaining) { |
| #if (BPI == 8) |
| if (remaining >= 64) { |
| oa[0] = xor_block(offset, ctx->L[0]); |
| ta[0] = xor_block(oa[0], ctp[0]); |
| oa[1] = xor_block(oa[0], ctx->L[1]); |
| ta[1] = xor_block(oa[1], ctp[1]); |
| oa[2] = xor_block(oa[1], ctx->L[0]); |
| ta[2] = xor_block(oa[2], ctp[2]); |
| offset = oa[3] = xor_block(oa[2], ctx->L[2]); |
| ta[3] = xor_block(offset, ctp[3]); |
| remaining -= 64; |
| k = 4; |
| } |
| #endif |
| if (remaining >= 32) { |
| oa[k] = xor_block(offset, ctx->L[0]); |
| ta[k] = xor_block(oa[k], ctp[k]); |
| offset = oa[k + 1] = xor_block(oa[k], ctx->L[1]); |
| ta[k + 1] = xor_block(offset, ctp[k + 1]); |
| remaining -= 32; |
| k += 2; |
| } |
| if (remaining >= 16) { |
| offset = oa[k] = xor_block(offset, ctx->L[0]); |
| ta[k] = xor_block(offset, ctp[k]); |
| remaining -= 16; |
| ++k; |
| } |
| if (remaining) { |
| block pad; |
| offset = xor_block(offset, ctx->Lstar); |
| AES_encrypt((unsigned char*)&offset, tmp.u8, &ctx->encrypt_key); |
| pad = tmp.bl; |
| memcpy(tmp.u8, ctp + k, remaining); |
| tmp.bl = xor_block(tmp.bl, pad); |
| tmp.u8[remaining] = (unsigned char)0x80u; |
| memcpy(ptp + k, tmp.u8, remaining); |
| checksum = xor_block(checksum, tmp.bl); |
| } |
| } |
| AES_ecb_decrypt_blks(ta, k, &ctx->decrypt_key); |
| switch (k) { |
| #if (BPI == 8) |
| case 7: |
| ptp[6] = xor_block(ta[6], oa[6]); |
| checksum = xor_block(checksum, ptp[6]); |
| case 6: |
| ptp[5] = xor_block(ta[5], oa[5]); |
| checksum = xor_block(checksum, ptp[5]); |
| case 5: |
| ptp[4] = xor_block(ta[4], oa[4]); |
| checksum = xor_block(checksum, ptp[4]); |
| case 4: |
| ptp[3] = xor_block(ta[3], oa[3]); |
| checksum = xor_block(checksum, ptp[3]); |
| #endif |
| case 3: |
| ptp[2] = xor_block(ta[2], oa[2]); |
| checksum = xor_block(checksum, ptp[2]); |
| case 2: |
| ptp[1] = xor_block(ta[1], oa[1]); |
| checksum = xor_block(checksum, ptp[1]); |
| case 1: |
| ptp[0] = xor_block(ta[0], oa[0]); |
| checksum = xor_block(checksum, ptp[0]); |
| } |
| |
| /* Calculate expected tag */ |
| offset = xor_block(offset, ctx->Ldollar); |
| tmp.bl = xor_block(offset, checksum); |
| AES_encrypt(tmp.u8, tmp.u8, &ctx->encrypt_key); |
| tmp.bl = xor_block(tmp.bl, ctx->ad_checksum); /* Full tag */ |
| |
| /* Compare with proposed tag, change ct_len if invalid */ |
| if ((OCB_TAG_LEN == 16) && tag) { |
| if (unequal_blocks(tmp.bl, *(block*)tag)) |
| ct_len = AE_INVALID; |
| } else { |
| #if (OCB_TAG_LEN > 0) |
| int len = OCB_TAG_LEN; |
| #else |
| int len = ctx->tag_len; |
| #endif |
| if (tag) { |
| if (constant_time_memcmp(tag, tmp.u8, len) != 0) |
| ct_len = AE_INVALID; |
| } else { |
| if (constant_time_memcmp((char*)ct + ct_len, tmp.u8, len) != 0) |
| ct_len = AE_INVALID; |
| } |
| } |
| } |
| return ct_len; |
| } |
| |
| /* ----------------------------------------------------------------------- */ |
| /* Simple test program */ |
| /* ----------------------------------------------------------------------- */ |
| |
| #if 0 |
| |
| #include <stdio.h> |
| #include <time.h> |
| |
| #if __GNUC__ |
| #define ALIGN(n) __attribute__((aligned(n))) |
| #elif _MSC_VER |
| #define ALIGN(n) __declspec(align(n)) |
| #else /* Not GNU/Microsoft: delete alignment uses. */ |
| #define ALIGN(n) |
| #endif |
| |
| static void pbuf(void *p, unsigned len, const void *s) |
| { |
| unsigned i; |
| if (s) |
| printf("%s", (char *)s); |
| for (i = 0; i < len; i++) |
| printf("%02X", (unsigned)(((unsigned char *)p)[i])); |
| printf("\n"); |
| } |
| |
| static void vectors(ae_ctx *ctx, int len) |
| { |
| ALIGN(16) char pt[128]; |
| ALIGN(16) char ct[144]; |
| ALIGN(16) char nonce[] = {0,1,2,3,4,5,6,7,8,9,10,11}; |
| int i; |
| for (i=0; i < 128; i++) pt[i] = i; |
| i = ae_encrypt(ctx,nonce,pt,len,pt,len,ct,NULL,AE_FINALIZE); |
| printf("P=%d,A=%d: ",len,len); pbuf(ct, i, NULL); |
| i = ae_encrypt(ctx,nonce,pt,0,pt,len,ct,NULL,AE_FINALIZE); |
| printf("P=%d,A=%d: ",0,len); pbuf(ct, i, NULL); |
| i = ae_encrypt(ctx,nonce,pt,len,pt,0,ct,NULL,AE_FINALIZE); |
| printf("P=%d,A=%d: ",len,0); pbuf(ct, i, NULL); |
| } |
| |
| void validate() |
| { |
| ALIGN(16) char pt[1024]; |
| ALIGN(16) char ct[1024]; |
| ALIGN(16) char tag[16]; |
| ALIGN(16) char nonce[12] = {0,}; |
| ALIGN(16) char key[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; |
| ae_ctx ctx; |
| char *val_buf, *next; |
| int i, len; |
| |
| val_buf = (char *)malloc(22400 + 16); |
| next = val_buf = (char *)(((size_t)val_buf + 16) & ~((size_t)15)); |
| |
| if (0) { |
| ae_init(&ctx, key, 16, 12, 16); |
| /* pbuf(&ctx, sizeof(ctx), "CTX: "); */ |
| vectors(&ctx,0); |
| vectors(&ctx,8); |
| vectors(&ctx,16); |
| vectors(&ctx,24); |
| vectors(&ctx,32); |
| vectors(&ctx,40); |
| } |
| |
| memset(key,0,32); |
| memset(pt,0,128); |
| ae_init(&ctx, key, OCB_KEY_LEN, 12, OCB_TAG_LEN); |
| |
| /* RFC Vector test */ |
| for (i = 0; i < 128; i++) { |
| int first = ((i/3)/(BPI*16))*(BPI*16); |
| int second = first; |
| int third = i - (first + second); |
| |
| nonce[11] = i; |
| |
| if (0) { |
| ae_encrypt(&ctx,nonce,pt,i,pt,i,ct,NULL,AE_FINALIZE); |
| memcpy(next,ct,(size_t)i+OCB_TAG_LEN); |
| next = next+i+OCB_TAG_LEN; |
| |
| ae_encrypt(&ctx,nonce,pt,i,pt,0,ct,NULL,AE_FINALIZE); |
| memcpy(next,ct,(size_t)i+OCB_TAG_LEN); |
| next = next+i+OCB_TAG_LEN; |
| |
| ae_encrypt(&ctx,nonce,pt,0,pt,i,ct,NULL,AE_FINALIZE); |
| memcpy(next,ct,OCB_TAG_LEN); |
| next = next+OCB_TAG_LEN; |
| } else { |
| ae_encrypt(&ctx,nonce,pt,first,pt,first,ct,NULL,AE_PENDING); |
| ae_encrypt(&ctx,NULL,pt+first,second,pt+first,second,ct+first,NULL,AE_PENDING); |
| ae_encrypt(&ctx,NULL,pt+first+second,third,pt+first+second,third,ct+first+second,NULL,AE_FINALIZE); |
| memcpy(next,ct,(size_t)i+OCB_TAG_LEN); |
| next = next+i+OCB_TAG_LEN; |
| |
| ae_encrypt(&ctx,nonce,pt,first,pt,0,ct,NULL,AE_PENDING); |
| ae_encrypt(&ctx,NULL,pt+first,second,pt,0,ct+first,NULL,AE_PENDING); |
| ae_encrypt(&ctx,NULL,pt+first+second,third,pt,0,ct+first+second,NULL,AE_FINALIZE); |
| memcpy(next,ct,(size_t)i+OCB_TAG_LEN); |
| next = next+i+OCB_TAG_LEN; |
| |
| ae_encrypt(&ctx,nonce,pt,0,pt,first,ct,NULL,AE_PENDING); |
| ae_encrypt(&ctx,NULL,pt,0,pt+first,second,ct,NULL,AE_PENDING); |
| ae_encrypt(&ctx,NULL,pt,0,pt+first+second,third,ct,NULL,AE_FINALIZE); |
| memcpy(next,ct,OCB_TAG_LEN); |
| next = next+OCB_TAG_LEN; |
| } |
| |
| } |
| nonce[11] = 0; |
| ae_encrypt(&ctx,nonce,NULL,0,val_buf,next-val_buf,ct,tag,AE_FINALIZE); |
| pbuf(tag,OCB_TAG_LEN,0); |
| |
| |
| /* Encrypt/Decrypt test */ |
| for (i = 0; i < 128; i++) { |
| int first = ((i/3)/(BPI*16))*(BPI*16); |
| int second = first; |
| int third = i - (first + second); |
| |
| nonce[11] = i%128; |
| |
| if (1) { |
| len = ae_encrypt(&ctx,nonce,val_buf,i,val_buf,i,ct,tag,AE_FINALIZE); |
| len = ae_encrypt(&ctx,nonce,val_buf,i,val_buf,-1,ct,tag,AE_FINALIZE); |
| len = ae_decrypt(&ctx,nonce,ct,len,val_buf,-1,pt,tag,AE_FINALIZE); |
| if (len == -1) { printf("Authentication error: %d\n", i); return; } |
| if (len != i) { printf("Length error: %d\n", i); return; } |
| if (memcmp(val_buf,pt,i)) { printf("Decrypt error: %d\n", i); return; } |
| } else { |
| len = ae_encrypt(&ctx,nonce,val_buf,i,val_buf,i,ct,NULL,AE_FINALIZE); |
| ae_decrypt(&ctx,nonce,ct,first,val_buf,first,pt,NULL,AE_PENDING); |
| ae_decrypt(&ctx,NULL,ct+first,second,val_buf+first,second,pt+first,NULL,AE_PENDING); |
| len = ae_decrypt(&ctx,NULL,ct+first+second,len-(first+second),val_buf+first+second,third,pt+first+second,NULL,AE_FINALIZE); |
| if (len == -1) { printf("Authentication error: %d\n", i); return; } |
| if (memcmp(val_buf,pt,i)) { printf("Decrypt error: %d\n", i); return; } |
| } |
| |
| } |
| printf("Decrypt: PASS\n"); |
| } |
| |
| int main() |
| { |
| validate(); |
| return 0; |
| } |
| #endif |
| |
| #if USE_AES_NI |
| char infoString[] = "OCB3 (AES-NI)"; |
| #elif USE_REFERENCE_AES |
| char infoString[] = "OCB3 (Reference)"; |
| #elif USE_OPENSSL_AES |
| char infoString[] = "OCB3 (OpenSSL)"; |
| #endif |