/*
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "macroAssembler_aarch64.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/stubRoutines.hpp"
void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
Label L_doLast;
ld1(v0, T16B, from); // get 16 bytes of input
ld1(v5, T16B, post(key, 16));
rev32(v5, T16B, v5);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, v3, v4, T16B, post(key, 64));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
rev32(v3, T16B, v3);
rev32(v4, T16B, v4);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
aesd(v0, v3);
aesimc(v0, v0);
aesd(v0, v4);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 44);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
cmpw(keylen, 52);
br(Assembler::EQ, L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
aesimc(v0, v0);
ld1(v1, v2, T16B, post(key, 32));
rev32(v1, T16B, v1);
rev32(v2, T16B, v2);
bind(L_doLast);
aesd(v0, v1);
aesimc(v0, v0);
aesd(v0, v2);
eor(v0, T16B, v0, v5);
st1(v0, T16B, to);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Load expanded key into v17..v31
void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
Label L_loadkeys_44, L_loadkeys_52;
cmpw(keylen, 52);
br(Assembler::LO, L_loadkeys_44);
br(Assembler::EQ, L_loadkeys_52);
ld1(v17, v18, T16B, post(key, 32));
rev32(v17, T16B, v17);
rev32(v18, T16B, v18);
bind(L_loadkeys_52);
ld1(v19, v20, T16B, post(key, 32));
rev32(v19, T16B, v19);
rev32(v20, T16B, v20);
bind(L_loadkeys_44);
ld1(v21, v22, v23, v24, T16B, post(key, 64));
rev32(v21, T16B, v21);
rev32(v22, T16B, v22);
rev32(v23, T16B, v23);
rev32(v24, T16B, v24);
ld1(v25, v26, v27, v28, T16B, post(key, 64));
rev32(v25, T16B, v25);
rev32(v26, T16B, v26);
rev32(v27, T16B, v27);
rev32(v28, T16B, v28);
ld1(v29, v30, v31, T16B, post(key, 48));
rev32(v29, T16B, v29);
rev32(v30, T16B, v30);
rev32(v31, T16B, v31);
// Preserve the address of the start of the key
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
// Neoverse(TM) N1 Software Optimization Guide:
// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC
// instruction pairs will exhibit the performance characteristics
// described in Section 4.6.
void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
aese(input, subkey); aesmc(input, input);
}
// KernelGenerator
//
// The abstract base class of an unrolled function generator.
// Subclasses override generate(), length(), and next() to generate
// unrolled and interleaved functions.
//
// The core idea is that a subclass defines a method which generates
// the base case of a function and a method to generate a clone of it,
// shifted to a different set of registers. KernelGenerator will then
// generate several interleaved copies of the function, with each one
// using a different set of registers.
// The subclass must implement three methods: length(), which is the
// number of instruction bundles in the intrinsic, generate(int n)
// which emits the nth instruction bundle in the intrinsic, and next()
// which takes an instance of the generator and returns a version of it,
// shifted to a new set of registers.
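//
// For example (illustration only), with unrolls == 2 and length() == 3,
// unroll() below emits the bundles in this order, where G0 is this
// generator and G1 = G0->next() is its clone on a shifted register set:
//
//   G0->generate(0); G1->generate(0);
//   G0->generate(1); G1->generate(1);
//   G0->generate(2); G1->generate(2);
//
// so that corresponding instructions from the two copies are adjacent
// in the emitted code and, being independent, can execute in parallel.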
class KernelGenerator: public MacroAssembler {
protected:
const int _unrolls;
public:
KernelGenerator(Assembler *as, int unrolls)
: MacroAssembler(as->code()), _unrolls(unrolls) { }
virtual void generate(int index) = 0;
virtual int length() = 0;
virtual KernelGenerator *next() = 0;
int unrolls() { return _unrolls; }
void unroll();
};
void KernelGenerator::unroll() {
ResourceMark rm;
KernelGenerator **generators
= NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
generators[0] = this;
for (int i = 1; i < unrolls(); i++) {
generators[i] = generators[i-1]->next();
}
for (int j = 0; j < length(); j++) {
for (int i = 0; i < unrolls(); i++) {
generators[i]->generate(j);
}
}
}
// An unrolled and interleaved generator for AES encryption.
class AESKernelGenerator: public KernelGenerator {
Register _from, _to;
const Register _keylen;
FloatRegister _data;
const FloatRegister _subkeys;
bool _once;
Label _rounds_44, _rounds_52;
public:
AESKernelGenerator(Assembler *as, int unrolls,
Register from, Register to, Register keylen, FloatRegister data,
FloatRegister subkeys, bool once = true)
: KernelGenerator(as, unrolls),
_from(from), _to(to), _keylen(keylen), _data(data),
_subkeys(subkeys), _once(once) {
}
virtual void generate(int index) {
switch (index) {
case 0:
if (_from != noreg) {
ld1(_data, T16B, _from); // get 16 bytes of input
}
break;
case 1:
if (_once) {
cmpw(_keylen, 52);
br(Assembler::LO, _rounds_44);
br(Assembler::EQ, _rounds_52);
}
break;
case 2: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 0)); break;
case 3: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 1)); break;
case 4:
if (_once) bind(_rounds_52);
break;
case 5: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 2)); break;
case 6: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 3)); break;
case 7:
if (_once) bind(_rounds_44);
break;
case 8: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 4)); break;
case 9: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 5)); break;
case 10: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 6)); break;
case 11: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 7)); break;
case 12: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 8)); break;
case 13: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 9)); break;
case 14: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 10)); break;
case 15: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 11)); break;
case 16: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 12)); break;
case 17: aese(_data, as_FloatRegister(_subkeys->encoding() + 13)); break;
case 18: eor(_data, T16B, _data, as_FloatRegister(_subkeys->encoding() + 14)); break;
case 19:
if (_to != noreg) {
st1(_data, T16B, _to);
}
break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator *next() {
return new AESKernelGenerator(this, _unrolls,
_from, _to, _keylen,
_data->successor(), _subkeys, /*once*/false);
}
virtual int length() { return 20; }
};
// Uses expanded key in v17..v31
// Returns encrypted values in inputs.
// If to != noreg, store value at to; likewise from
// Preserves key, keylen
// Increments from, to
// Input data in v0, v1, ...
// unrolls controls the number of times to unroll the generated function
void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
FloatRegister data, int unrolls) {
AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
}
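// As an illustration (not emitted verbatim anywhere), with unrolls == 2,
// from == to == noreg and data == v0, the interleaved code produced by
// the generator above looks roughly like this (first rounds only):
//
//   cmpw(keylen, 52); br(LO, rounds_44); br(EQ, rounds_52);  // clone 0 only
//   aese(v0, v17); aesmc(v0, v0);    // clone 0, first round key
//   aese(v1, v17); aesmc(v1, v1);    // clone 1, first round key
//   aese(v0, v18); aesmc(v0, v0);    // clone 0, second round key
//   aese(v1, v18); aesmc(v1, v1);    // clone 1, second round key
//   ...
//
// Each AESE/AESMC pair stays adjacent (see the Neoverse note above
// aes_round), while pairs from different clones are independent and can
// overlap in the pipeline.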
// ghash_multiply and ghash_reduce are the non-unrolled versions of
// the GHASH function generators.
void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
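//
// Since addition in GF(2) is XOR, expanding (A0+A1)(B0+B1) gives
// A1*B1 + A1*B0 + A0*B1 + A0*B0, so the cross term of the product is
//
//   A1*B0 + A0*B1 = (E1:E0) + (C1:C0) + (D1:D0)
//
// which is what the EOR sequence below accumulates (via tmp1..tmp3)
// before the two INS instructions merge it into the middle 64-bit
// words of the result pair <result_hi:result_lo>.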
ext(tmp1, T16B, b, b, 0x08);
pmull2(result_hi, T1Q, b, a, T2D); // A1*B1
eor(tmp1, T16B, tmp1, b); // (B1+B0)
pmull(result_lo, T1Q, b, a, T1D); // A0*B0
pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
ext(tmp1, T16B, result_lo, result_hi, 0x08);
eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0
eor(tmp2, T16B, tmp2, tmp1);
eor(tmp2, T16B, tmp2, tmp3);
// Register pair <result_hi:result_lo> holds the result of carry-less multiplication
ins(result_hi, D, tmp2, 0, 1);
ins(result_lo, D, tmp2, 1, 0);
}
void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister vzr, FloatRegister t1) {
const FloatRegister t0 = result;
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
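//
// Because hi is itself 128 bits wide, the folding is done in two steps:
//
//   hi1 * z^192 === hi1 * p(z) * z^64  (first PMULL2, spread across
//                                       hi0 and lo1 by the EXT/EORs)
//   hi0 * z^128 === hi0 * p(z)         (second PMULL, XORed into lo)
//
// where hi = hi1:hi0, giving the reduced 128-bit result.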
pmull2(t0, T1Q, hi, p, T2D);
ext(t1, T16B, t0, vzr, 8);
eor(hi, T16B, hi, t1);
ext(t1, T16B, vzr, t0, 8);
eor(lo, T16B, lo, t1);
pmull(t0, T1Q, hi, p, T1D);
eor(result, T16B, lo, t0);
}
class GHASHMultiplyGenerator: public KernelGenerator {
FloatRegister _result_lo, _result_hi, _b,
_a, _vzr, _a1_xor_a0, _p,
_tmp1, _tmp2, _tmp3;
public:
GHASHMultiplyGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result_lo, FloatRegister result_hi,
FloatRegister b,
/* non-offsetted (shared) registers */
FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
/* offsetted (temp) registers */
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
: KernelGenerator(as, unrolls),
_result_lo(result_lo), _result_hi(result_hi), _b(b),
_a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
_tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
int register_stride = 7;
virtual void generate(int index) {
// Karatsuba multiplication performs a 128*128 -> 256-bit
// multiplication in three 128-bit multiplications and a few
// additions.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
//
// A0 in a.d[0] (subkey)
// A1 in a.d[1]
// (A1+A0) in a1_xor_a0.d[0]
//
// B0 in b.d[0] (state)
// B1 in b.d[1]
switch (index) {
case 0: ext(_tmp1, T16B, _b, _b, 0x08); break;
case 1: pmull2(_result_hi, T1Q, _b, _a, T2D); // A1*B1
break;
case 2: eor(_tmp1, T16B, _tmp1, _b); // (B1+B0)
break;
case 3: pmull(_result_lo, T1Q, _b, _a, T1D); // A0*B0
break;
case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
break;
case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break;
case 6: eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
break;
case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break;
case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break;
// Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
case 9: ins(_result_hi, D, _tmp2, 0, 1); break;
case 10: ins(_result_lo, D, _tmp2, 1, 0); break;
default: ShouldNotReachHere();
}
}
virtual KernelGenerator* next() {
GHASHMultiplyGenerator* result = new GHASHMultiplyGenerator(*this);
result->_result_lo = as_FloatRegister(result->_result_lo->encoding() + register_stride);
result->_result_hi = as_FloatRegister(result->_result_hi->encoding() + register_stride);
result->_b = as_FloatRegister(result->_b ->encoding() + register_stride);
result->_tmp1 = as_FloatRegister(result->_tmp1 ->encoding() + register_stride);
result->_tmp2 = as_FloatRegister(result->_tmp2 ->encoding() + register_stride);
result->_tmp3 = as_FloatRegister(result->_tmp3 ->encoding() + register_stride);
return result;
}
virtual int length() { return 11; }
};
// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
// The FloatRegister argument called data is optional: if it is a
// valid register, we interleave LD1 instructions with the
// reduction. This is to reduce latency next time around the loop.
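// The loads use r2 as their base register, which is assumed here to be
// the incoming data pointer in the stub's calling convention (see the
// post(r2, 0x10) addressing in generate() below).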
class GHASHReduceGenerator: public KernelGenerator {
FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
int _once;
public:
GHASHReduceGenerator(Assembler *as, int unrolls,
/* offsetted registers */
FloatRegister result, FloatRegister lo, FloatRegister hi,
/* non-offsetted (shared) registers */
FloatRegister p, FloatRegister vzr, FloatRegister data,
/* offsetted (temp) registers */
FloatRegister t1)
: KernelGenerator(as, unrolls),
_result(result), _lo(lo), _hi(hi),
_p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
int register_stride = 7;
virtual void generate(int index) {
const FloatRegister t0 = _result;
switch (index) {
// The GCM field polynomial f is z^128 + p(z), where p =
// z^7+z^2+z+1.
//
// z^128 === -p(z) (mod (z^128 + p(z)))
//
// so, given that the product we're reducing is
// a == lo + hi * z^128
// substituting,
// === lo - hi * p(z) (mod (z^128 + p(z)))
//
// we reduce by multiplying hi by p(z) and subtracting the _result
// from (i.e. XORing it with) lo. Because p has no nonzero high
// bits we can do this with two 64-bit multiplications, lo*p and
// hi*p.
case 0: pmull2(t0, T1Q, _hi, _p, T2D); break;
case 1: ext(_t1, T16B, t0, _vzr, 8); break;
case 2: eor(_hi, T16B, _hi, _t1); break;
case 3: ext(_t1, T16B, _vzr, t0, 8); break;
case 4: eor(_lo, T16B, _lo, _t1); break;
case 5: pmull(t0, T1Q, _hi, _p, T1D); break;
case 6: eor(_result, T16B, _lo, t0); break;
default: ShouldNotReachHere();
}
// Sprinkle load instructions into the generated instructions
if (_data->is_valid() && _once) {
assert(length() >= unrolls(), "not enough room for interleaved loads");
if (index < unrolls()) {
ld1(as_FloatRegister(_data->encoding() + index*register_stride), T16B, post(r2, 0x10));
}
}
}
virtual KernelGenerator *next() {
GHASHReduceGenerator *result = new GHASHReduceGenerator(*this);
result->_result = as_FloatRegister(result->_result->encoding() + register_stride);
result->_hi = as_FloatRegister(result->_hi ->encoding() + register_stride);
result->_lo = as_FloatRegister(result->_lo ->encoding() + register_stride);
result->_t1 = as_FloatRegister(result->_t1 ->encoding() + register_stride);
result->_once = false;
return result;
}
int length() { return 7; }
};
// Perform a GHASH multiply/reduce on a single FloatRegister.
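// Computes result = a*b in GF(2^128), reduced by the field polynomial p.
// result_lo, result_hi and t1..t3 are used as scratch registers and are
// clobbered; vzr must contain zero.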
void MacroAssembler::ghash_modmul(FloatRegister result,
FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
FloatRegister t1, FloatRegister t2, FloatRegister t3) {
ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
}
// Interleaved GHASH processing.
//
// Clobbers all vector registers.
//
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
Register subkeyH,
Register data, Register blocks, int unrolls) {
int register_stride = 7;
// Bafflingly, GCM uses little-endian for the byte order, but
// big-endian for the bit order. For example, the polynomial 1 is
// represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
//
// So, we must either reverse the bytes in each word and do
// everything big-endian or reverse the bits in each byte and do
// it little-endian. On AArch64 it's more idiomatic to reverse
// the bits in each byte (we have an instruction, RBIT, to do
// that) and keep the data in little-endian bit order through the
// calculation, bit-reversing the inputs and outputs.
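// Note that the field_polynomial constant is expected to contain only
// the low-order terms p(z) = z^7+z^2+z+1 (the z^128 term is implicit
// in the reduction, see ghash_reduce above), replicated into both
// 64-bit halves so that PMULL and PMULL2 can each use it.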
assert(unrolls * register_stride < 32, "out of registers");
FloatRegister a1_xor_a0 = v28;
FloatRegister Hprime = v29;
FloatRegister vzr = v30;
FloatRegister p = v31;
eor(vzr, T16B, vzr, vzr); // zero register
ldrq(p, field_polynomial); // The field polynomial
ldrq(v0, Address(state));
ldrq(Hprime, Address(subkeyH));
rev64(v0, T16B, v0); // Bit-reverse words in state and subkeyH
rbit(v0, T16B, v0);
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
// Powers of H -> Hprime
Label already_calculated, done;
{
// The first time around we'll have to calculate H**2, H**3, etc.
// Look at the largest power of H in the subkeyH array to see if
// it's already been calculated.
ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
orr(rscratch1, rscratch1, rscratch2);
cbnz(rscratch1, already_calculated);
orr(v6, T16B, Hprime, Hprime); // Start with H in v6 and Hprime
for (int i = 1; i < unrolls; i++) {
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1, v3, v2);
rev64(v1, T16B, v6);
rbit(v1, T16B, v1);
strq(v1, Address(subkeyH, 16 * i));
}
b(done);
}
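// At this point subkeyH[0 .. unrolls-1] caches H**1 .. H**unrolls in the
// original byte/bit order, so on later calls with the same subkey array
// the cbnz above branches straight to already_calculated.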
{
bind(already_calculated);
// Load the largest power of H we need into v6.
ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
rev64(v6, T16B, v6);
rbit(v6, T16B, v6);
}
bind(done);
orr(Hprime, T16B, v6, v6); // Move H ** unrolls into Hprime
// Hprime now holds H ** unrolls; the lower powers remain cached in the subkeyH array
// v0 contains the initial state. Clear the others.
for (int i = 1; i < unrolls; i++) {
int ofs = register_stride * i;
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
eor(v0_ofs, T16B, v0_ofs, v0_ofs); // zero each state register
}
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
// Load #unrolls blocks of data
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
ld1(v2_ofs, T16B, post(data, 0x10));
}
// Register assignments, replicated across 4 clones, v0 ... v27
//
// v0: input / output: current state, result of multiply/reduce
// v1: temp
// v2: input: one block of data (the ciphertext)
// also used as a temp once the data has been consumed
// v3: temp
// v4: output: high part of product
// v5: output: low part ...
// v6: unused
//
// Not replicated:
//
// v28: High part of H xor low part of H'
// v29: H' (hash subkey)
// v30: zero
// v31: Reduction polynomial of the Galois field
// Inner loop.
// Do the whole load/add/multiply/reduce over all our data except
// the last few blocks.
{
Label L_ghash_loop;
bind(L_ghash_loop);
// Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
// prfm(Address(data, 128), PLDL1KEEP);
// Xor data into current state
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
rbit(v2_ofs, T16B, v2_ofs);
eor(v2_ofs, T16B, v0_ofs, v2_ofs); // bit-swapped data ^ bit-swapped state
}
// Generate fully-unrolled multiply-reduce in two stages.
GHASHMultiplyGenerator(this, unrolls,
/*result_lo*/v5, /*result_hi*/v4, /*data*/v2,
Hprime, a1_xor_a0, p, vzr,
/*temps*/v1, v3, /* reuse b*/v2) .unroll();
// NB: GHASHReduceGenerator also loads the next #unrolls blocks of
// data into v2, v2+ofs, ready for the next iteration.
GHASHReduceGenerator (this, unrolls,
/*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr,
/*data*/v2, /*temp*/v3) .unroll();
sub(blocks, blocks, unrolls);
cmp(blocks, (unsigned char)(unrolls * 2));
br(GE, L_ghash_loop);
}
// Merge the #unrolls states. Note that the data for the next
// iteration has already been loaded into v2, v2+ofs, etc...
// First, we multiply/reduce each clone by the appropriate power of H.
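// After its final data block is folded in (the rbit/eor below), clone i
// still has to be multiplied by H**(unrolls - i) for all the clones to
// carry the right weight in the final sum; that power is loaded from the
// subkeyH cache (entry unrolls - i - 1).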
for (int i = 0; i < unrolls; i++) {
int ofs = register_stride * i;
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
FloatRegister v1_ofs = as_FloatRegister(v1->encoding() + ofs);
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
FloatRegister v3_ofs = as_FloatRegister(v3->encoding() + ofs);
FloatRegister v4_ofs = as_FloatRegister(v4->encoding() + ofs);
FloatRegister v5_ofs = as_FloatRegister(v5->encoding() + ofs);
ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
rbit(v2_ofs, T16B, v2_ofs);
eor(v2_ofs, T16B, v0_ofs, v2_ofs); // bit-swapped data ^ bit-swapped state
rev64(Hprime, T16B, Hprime);
rbit(Hprime, T16B, Hprime);
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
ghash_modmul(/*result*/v0_ofs, /*result_lo*/v5_ofs, /*result_hi*/v4_ofs, /*b*/v2_ofs,
Hprime, vzr, a1_xor_a0, p,
/*temps*/v1_ofs, v3_ofs, /* reuse b*/v2_ofs);
}
// Then we sum the results.
for (int i = 1; i < unrolls; i++) {
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + register_stride * i);
eor(v0, T16B, v0, v0_ofs);
}
sub(blocks, blocks, (unsigned char)unrolls);
// And finally bit-reverse the state back to big endian.
rev64(v0, T16B, v0);
rbit(v0, T16B, v0);
st1(v0, T16B, state);
}