/*
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "macroAssembler_aarch64.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/stubRoutines.hpp"
void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
Label L_doLast;
ld1(v0, T16B, from); // get 16 bytes of input
ld1(v5, T16B, post(key, 16));
rev32(v5, T16B, v5);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 44);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 52);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
bind(L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
eor(v0, T16B, v0, v5);
st1(v0, T16B, to);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Load expanded key into v17..v31
void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
Label L_loadkeys_44, L_loadkeys_52;
cmpw(keylen, 52);
br(Assembler::LO, L_loadkeys_44);
br(Assembler::EQ, L_loadkeys_52);
ld1(v17, v18, T16B, post(key, 32));
rev32(v17, T16B, v17);
rev32(v18, T16B, v18);
bind(L_loadkeys_52);
ld1(v19, v20, T16B, post(key, 32));
rev32(v19, T16B, v19);
rev32(v20, T16B, v20);
bind(L_loadkeys_44);
ld1(v21, v22, v23, v24, T16B, post(key, 64));
rev32(v21, T16B, v21);
rev32(v22, T16B, v22);
rev32(v23, T16B, v23);
rev32(v24, T16B, v24);
ld1(v25, v26, v27, v28, T16B, post(key, 64));
rev32(v25, T16B, v25);
rev32(v26, T16B, v26);
rev32(v27, T16B, v27);
rev32(v28, T16B, v28);
ld1(v29, v30, v31, T16B, post(key, 48));
rev32(v29, T16B, v29);
rev32(v30, T16B, v30);
rev32(v31, T16B, v31);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Neoverse(TM) N1 Software Optimization Guide:
// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC
// instruction pairs will exhibit the performance characteristics
// described in Section 4.6.
void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
aese(input, subkey); aesmc(input, input);
}
// KernelGenerator
//
// The abstract base class of an unrolled function generator.
// Subclasses override generate(), length(), and next() to generate
// unrolled and interleaved functions.
//
// The core idea is that a subclass defines a method which generates
// the base case of a function and a method to generate a clone of it,
// shifted to a different set of registers. KernelGenerator will then
// generate several interleaved copies of the function, with each one
// using a different set of registers.
// The subclass must implement three methods: length(), which is the
// number of instruction bundles in the intrinsic, generate(int n)
// which emits the nth instruction bundle in the intrinsic, and next()
// which takes an instance of the generator and returns a version of it,
// shifted to a new set of registers.
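//
// For example (illustration only), with unrolls == 2 and length() == 3,
// unroll() below emits the bundles in this order, where G0 is this
// generator and G1 = G0->next() is its clone on a shifted register set:
//
//   G0->generate(0); G1->generate(0);
//   G0->generate(1); G1->generate(1);
//   G0->generate(2); G1->generate(2);
//
// so that corresponding instructions from the two copies are adjacent
// in the emitted code and, being independent, can execute in parallel.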
class KernelGenerator: public MacroAssembler {
protected:
const int _unrolls;
public:
KernelGenerator(Assembler *as, int unrolls)
: MacroAssembler(as->code()), _unrolls(unrolls) { }
virtual void generate(int index) = 0;
virtual int length() = 0;
virtual KernelGenerator *next() = 0;
int unrolls() { return _unrolls; }
void unroll();
};
void KernelGenerator::unroll() {
ResourceMark rm;
KernelGenerator **generators
= NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
generators[0] = this;
for (int i = 1; i < unrolls(); i++) {
generators[i] = generators[i-1]->next();
}
for (int j = 0; j < length(); j++) {
for (int i = 0; i < unrolls(); i++) {
generators[i]->generate(j);
}
}
}
// An unrolled and interleaved generator for AES encryption.
class AESKernelGenerator: public KernelGenerator {
Register _from, _to;
const Register _keylen;
FloatRegister _data;
const FloatRegister _subkeys;
bool _once;
Label _rounds_44, _rounds_52;
public:
AESKernelGenerator(Assembler *as, int unrolls,
Register from, Register to, Register keylen, FloatRegister data,
FloatRegister subkeys, bool once = true)
: KernelGenerator(as, unrolls),
_from(from), _to(to), _keylen(keylen), _data(data),
_subkeys(subkeys), _once(once) {
}
virtual void generate(int index) {
switch (index) {
case 0:
if (_from != noreg) {
ld1(_data, T16B, _from); // get 16 bytes of input
}
break;
case 1:
if (_once) {
cmpw(_keylen, 52);
br(Assembler::LO, _rounds_44);
br(Assembler::EQ, _rounds_52);
}
break;
case 2: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 0)); break;
case 3: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 1)); break;
case 4:
if (_once) bind(_rounds_52);
break;
case 5: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 2)); break;
case 6: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 3)); break;
case 7:
if (_once) bind(_rounds_44);
break;
case 8: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 4)); break;
case 9: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 5)); break;
case 10: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 6)); break;
case 11: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 7)); break;
case 12: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 8)); break;
case 13: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 9)); break;
case 14: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 10)); break;
case 15: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 11)); break;
case 16: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 12)); break;
case 17: aese(_data, as_FloatRegister(_subkeys->encoding() + 13)); break;
case 18: eor(_data, T16B, _data, as_FloatRegister(_subkeys->encoding() + 14)); break;
case 19:
if (_to != noreg) {
st1(_data, T16B, _to);
}
break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator *next() {
return new AESKernelGenerator(this, _unrolls,
_from, _to, _keylen,
_data->successor(), _subkeys, /*once*/false);
}
virtual int length() { return 20; }
};
// Uses expanded key in v17..v31
// Returns encrypted values in inputs.
// If to != noreg, store value at to; likewise from
// Preserves key, keylen
// Increments from, to
// Input data in v0, v1, ...
// unrolls controls the number of times to unroll the generated function
void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
FloatRegister data, int unrolls) {
AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
}
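// As an illustration (not emitted verbatim anywhere), with unrolls == 2,
// from == to == noreg and data == v0, the interleaved code produced by
// the generator above looks roughly like this (first rounds only):
//
//   cmpw(keylen, 52); br(LO, rounds_44); br(EQ, rounds_52);  // clone 0 only
//   aese(v0, v17); aesmc(v0, v0);    // clone 0, first round key
//   aese(v1, v17); aesmc(v1, v1);    // clone 1, first round key
//   aese(v0, v18); aesmc(v0, v0);    // clone 0, second round key
//   aese(v1, v18); aesmc(v1, v1);    // clone 1, second round key
//   ...
//
// Each AESE/AESMC pair stays adjacent (see the Neoverse note above
// aes_round), while pairs from different clones are independent and can
// overlap in the pipeline.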
// ghash_multiply and ghash_reduce are the non-unrolled versions of
// the GHASH function generators.
void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
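//
// Since addition in GF(2) is XOR, expanding (A0+A1)(B0+B1) gives
// A1*B1 + A1*B0 + A0*B1 + A0*B0, so the cross term of the product is
//
//   A1*B0 + A0*B1 = (E1:E0) + (C1:C0) + (D1:D0)
//
// which is what the EOR sequence below accumulates (via tmp1..tmp3)
// before the two INS instructions merge it into the middle 64-bit
// words of the result pair <result_hi:result_lo>.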
ext(tmp1, T16B, b, b, 0x08);
pmull2(result_hi, T1Q, b, a, T2D); // A1*B1
eor(tmp1, T16B, tmp1, b); // (B1+B0)
pmull(result_lo, T1Q, b, a, T1D); // A0*B0
pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
ext(tmp1, T16B, result_lo, result_hi, 0x08);
eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0
eor(tmp2, T16B, tmp2, tmp1);
eor(tmp2, T16B, tmp2, tmp3);
// Register pair <result_hi:result_lo> holds the result of carry-less multiplication
ins(result_hi, D, tmp2, 0, 1);
ins(result_lo, D, tmp2, 1, 0);
}
void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister vzr, FloatRegister t1) {
const FloatRegister t0 = result;
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
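//
// Because hi is itself 128 bits wide, the folding is done in two steps:
//
//   hi1 * z^192 === hi1 * p(z) * z^64  (first PMULL2, spread across
//                                       hi0 and lo1 by the EXT/EORs)
//   hi0 * z^128 === hi0 * p(z)         (second PMULL, XORed into lo)
//
// where hi = hi1:hi0, giving the reduced 128-bit result.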
pmull2(t0, T1Q, hi, p, T2D);
ext(t1, T16B, t0, vzr, 8);
eor(hi, T16B, hi, t1);
ext(t1, T16B, vzr, t0, 8);
eor(lo, T16B, lo, t1);
pmull(t0, T1Q, hi, p, T1D);
eor(result, T16B, lo, t0);
}
class GHASHMultiplyGenerator: public KernelGenerator {
FloatRegister _result_lo, _result_hi, _b,
_a, _vzr, _a1_xor_a0, _p,
_tmp1, _tmp2, _tmp3;
public:
GHASHMultiplyGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result_lo, FloatRegister result_hi,
FloatRegister b,
/* non-offsetted (shared) registers */
FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
/* offsetted (temp) registers */
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
: KernelGenerator(as, unrolls),
_result_lo(result_lo), _result_hi(result_hi), _b(b),
_a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
_tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
int register_stride = 7;
virtual void generate(int index) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
switch (index) {
case 0: ext(_tmp1, T16B, _b, _b, 0x08); break;
case 1: pmull2(_result_hi, T1Q, _b, _a, T2D); // A1*B1
break;
case 2: eor(_tmp1, T16B, _tmp1, _b); // (B1+B0)
break;
case 3: pmull(_result_lo, T1Q, _b, _a, T1D); // A0*B0
break;
case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
break;
case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break;
case 6: eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
break;
case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break;
case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break;
// Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
case 9: ins(_result_hi, D, _tmp2, 0, 1); break;
case 10: ins(_result_lo, D, _tmp2, 1, 0); break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator* next() {
GHASHMultiplyGenerator* result = new GHASHMultiplyGenerator(*this);
result->_result_lo = as_FloatRegister(result->_result_lo->encoding() + register_stride);
result->_result_hi = as_FloatRegister(result->_result_hi->encoding() + register_stride);
result->_b = as_FloatRegister(result->_b ->encoding() + register_stride);
result->_tmp1 = as_FloatRegister(result->_tmp1 ->encoding() + register_stride);
result->_tmp2 = as_FloatRegister(result->_tmp2 ->encoding() + register_stride);
result->_tmp3 = as_FloatRegister(result->_tmp3 ->encoding() + register_stride);
return result;
}
virtual int length() { return 11; }
};
// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
// The FloatRegister argument called data is optional: if it is a
// valid register, we interleave LD1 instructions with the
// reduction. This is to reduce latency next time around the loop.
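// The loads use r2 as their base register, which is assumed here to be
// the incoming data pointer in the stub's calling convention (see the
// post(r2, 0x10) addressing in generate() below).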
class GHASHReduceGenerator: public KernelGenerator {
FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
int _once;
public:
GHASHReduceGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result, FloatRegister lo, FloatRegister hi,
/* non-offsetted (shared) registers */
FloatRegister p, FloatRegister vzr, FloatRegister data,
/* offsetted (temp) registers */
FloatRegister t1)
: KernelGenerator(as, unrolls),
_result(result), _lo(lo), _hi(hi),
_p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
int register_stride = 7;
virtual void generate(int index) {
const FloatRegister t0 = _result;
switch (index) {
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the _result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
case 0: pmull2(t0, T1Q, _hi, _p, T2D); break;
case 1: ext(_t1, T16B, t0, _vzr, 8); break;
case 2: eor(_hi, T16B, _hi, _t1); break;
case 3: ext(_t1, T16B, _vzr, t0, 8); break;
case 4: eor(_lo, T16B, _lo, _t1); break;
case 5: pmull(t0, T1Q, _hi, _p, T1D); break;
case 6: eor(_result, T16B, _lo, t0); break;
default: ShouldNotReachHere();
}
// Sprinkle load instructions into the generated instructions
if (_data->is_valid() && _once) {
assert(length() >= unrolls(), "not enough room for interleaved loads");
if (index < unrolls()) {
ld1(as_FloatRegister(_data->encoding() + index*register_stride), T16B, post(r2, 0x10));
}
}
}
virtual KernelGenerator *next() {
GHASHReduceGenerator *result = new GHASHReduceGenerator(*this);
result->_result = as_FloatRegister(result->_result->encoding() + register_stride);
result->_hi = as_FloatRegister(result->_hi ->encoding() + register_stride);
result->_lo = as_FloatRegister(result->_lo ->encoding() + register_stride);
result->_t1 = as_FloatRegister(result->_t1 ->encoding() + register_stride);
result->_once = false;
return result;
}
int length() { return 7; }
};
// Perform a GHASH multiply/reduce on a single FloatRegister.
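// Computes result = a*b in GF(2^128), reduced by the field polynomial p.
// result_lo, result_hi and t1..t3 are used as scratch registers and are
// clobbered; vzr must contain zero.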
void MacroAssembler::ghash_modmul(FloatRegister result,
FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
FloatRegister t1, FloatRegister t2, FloatRegister t3) {
ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
}
// Interleaved GHASH processing.
//
// Clobbers all vector registers.
//
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
Register subkeyH,
Register data, Register blocks, int unrolls) {
int register_stride = 7;
// Bafflingly, GCM uses little-endian for the byte order, but
// big-endian for the bit order. For example, the polynomial 1 is
// represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
//
// So, we must either reverse the bytes in each word and do
// everything big-endian or reverse the bits in each byte and do
// it little-endian. On AArch64 it's more idiomatic to reverse
// the bits in each byte (we have an instruction, RBIT, to do
// that) and keep the data in little-endian bit order through the
// calculation, bit-reversing the inputs and outputs.
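// Note that the field_polynomial constant is expected to contain only
// the low-order terms p(z) = z^7+z^2+z+1 (the z^128 term is implicit
// in the reduction, see ghash_reduce above), replicated into both
// 64-bit halves so that PMULL and PMULL2 can each use it.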
assert(unrolls * register_stride < 32, "out of registers");
FloatRegister a1_xor_a0 = v28;
FloatRegister Hprime = v29;
FloatRegister vzr = v30;
FloatRegister p = v31;
eor(vzr, T16B, vzr, vzr); // zero register
ldrq(p, field_polynomial); // The field polynomial
ldrq(v0, Address(state));
ldrq(Hprime, Address(subkeyH));
rev64(v0, T16B, v0); // Bit-reverse words in state and subkeyH
rbit(v0, T16B, v0);
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
// Powers of H -> Hprime
Label already_calculated, done;
{
// The first time around we'll have to calculate H**2, H**3, etc.
// Look at the largest power of H in the subkeyH array to see if
// it's already been calculated.
ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
orr(rscratch1, rscratch1, rscratch2);
cbnz(rscratch1, already_calculated);
orr(v6, T16B, Hprime, Hprime); // Start with H in v6 and Hprime
for (int i = 1; i < unrolls; i++) {
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1, v3, v2);
rev64(v1, T16B, v6);
rbit(v1, T16B, v1);
strq(v1, Address(subkeyH, 16 * i));
}
b(done);
}
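// At this point subkeyH[0 .. unrolls-1] caches H**1 .. H**unrolls in the
// original byte/bit order, so on later calls with the same subkey array
// the cbnz above branches straight to already_calculated.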
{
bind(already_calculated);
// Load the largest power of H we need into v6.
ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
rev64(v6, T16B, v6);
rbit(v6, T16B, v6);
}
bind(done);
orr(Hprime, T16B, v6, v6); // Move H ** unrolls into Hprime
// Hprime now holds H ** unrolls; the lower powers remain cached in the subkeyH array
// v0 contains the initial state. Clear the others.
for (int i = 1; i < unrolls; i++) {
int ofs = register_stride * i;
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
eor(v0_ofs, T16B, v0_ofs, v0_ofs); // zero each state register
}
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
// Load #unrolls blocks of data
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
ld1(v2_ofs, T16B, post(data, 0x10));
}
// Register assignments, replicated across 4 clones, v0 ... v27
//
// v0: input / output: current state, result of multiply/reduce
// v1: temp
// v2: input: one block of data (the ciphertext)
// also used as a temp once the data has been consumed
// v3: temp
// v4: output: high part of product
// v5: output: low part ...
// v6: unused
//
// Not replicated:
//
// v28: High part of H xor low part of H'
// v29: H' (hash subkey)
// v30: zero
// v31: Reduction polynomial of the Galois field
// Inner loop.
// Do the whole load/add/multiply/reduce over all our data except
// the last few blocks.
{
Label L_ghash_loop;
bind(L_ghash_loop);
// Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
// prfm(Address(data, 128), PLDL1KEEP);
// Xor data into current state
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
rbit(v2_ofs, T16B, v2_ofs);
eor(v2_ofs, T16B, v0_ofs, v2_ofs); // bit-swapped data ^ bit-swapped state
}
// Generate fully-unrolled multiply-reduce in two stages.
GHASHMultiplyGenerator(this, unrolls,
/*result_lo*/v5, /*result_hi*/v4, /*data*/v2,
Hprime, a1_xor_a0, p, vzr,
/*temps*/v1, v3, /* reuse b*/v2) .unroll();
// NB: GHASHReduceGenerator also loads the next #unrolls blocks of
// data into v2, v2+ofs, ready for the next iteration.
GHASHReduceGenerator (this, unrolls,
/*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr,
/*data*/v2, /*temp*/v3) .unroll();
sub(blocks, blocks, unrolls);
cmp(blocks, (unsigned char)(unrolls * 2));
br(GE, L_ghash_loop);
}
// Merge the #unrolls states. Note that the data for the next
// iteration has already been loaded into v2, v2+ofs, etc...
// First, we multiply/reduce each clone by the appropriate power of H.
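// After its final data block is folded in (the rbit/eor below), clone i
// still has to be multiplied by H**(unrolls - i) for all the clones to
// carry the right weight in the final sum; that power is loaded from the
// subkeyH cache (entry unrolls - i - 1).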
for (int i = 0; i < unrolls; i++) {
int ofs = register_stride * i;
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
FloatRegister v1_ofs = as_FloatRegister(v1->encoding() + ofs);
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
FloatRegister v3_ofs = as_FloatRegister(v3->encoding() + ofs);
FloatRegister v4_ofs = as_FloatRegister(v4->encoding() + ofs);
FloatRegister v5_ofs = as_FloatRegister(v5->encoding() + ofs);
ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
rbit(v2_ofs, T16B, v2_ofs);
eor(v2_ofs, T16B, v0_ofs, v2_ofs); // bit-swapped data ^ bit-swapped state
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v0_ofs, /*result_lo*/v5_ofs, /*result_hi*/v4_ofs, /*b*/v2_ofs,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1_ofs, v3_ofs, /* reuse b*/v2_ofs);
}
// Then we sum the results.
for (int i = 1; i < unrolls; i++) {
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + register_stride * i);
eor(v0, T16B, v0, v0_ofs);
}
sub(blocks, blocks, (unsigned char)unrolls);
// And finally bit-reverse the state back to big endian.
rev64(v0, T16B, v0);
rbit(v0, T16B, v0);
st1(v0, T16B, state);
}