| // Copyright 2019, VIXL authors |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // * Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // * Neither the name of ARM Limited nor the names of its contributors may be |
| // used to endorse or promote products derived from this software without |
| // specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE |
| // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "macro-assembler-aarch64.h" |
| |
| namespace vixl { |
| namespace aarch64 { |
| |
| void MacroAssembler::AddSubHelper(AddSubHelperOption option, |
| const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| VIXL_ASSERT(imm.FitsInLane(zd)); |
| |
| // Simple, encodable cases. |
| if (TrySingleAddSub(option, zd, zn, imm)) return; |
| |
| VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate)); |
| bool add_imm = (option == kAddImmediate); |
| |
| // Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one |
| // instruction. Also interpret the immediate as signed, so we can convert |
| // Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc. |
| IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits())); |
| if (signed_imm.IsNegative()) { |
| AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate; |
| IntegerOperand n_imm(signed_imm.GetMagnitude()); |
| // IntegerOperand can represent -INT_MIN, so this is always safe. |
| VIXL_ASSERT(n_imm.IsPositiveOrZero()); |
| if (TrySingleAddSub(n_option, zd, zn, n_imm)) return; |
| } |
| |
| // Otherwise, fall back to dup + ADD_z_z/SUB_z_z. |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits()); |
| Dup(scratch, imm); |
| |
| SingleEmissionCheckScope guard(this); |
| if (add_imm) { |
| add(zd, zn, scratch); |
| } else { |
| sub(zd, zn, scratch); |
| } |
| } |
| |
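| // Try to emit the add/sub as a single instruction (plus a movprfx if zd does |
| // not alias zn), using the shifted 8-bit unsigned immediate form. Returns |
| // false if `imm` cannot be encoded that way. |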
| bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option, |
| const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| VIXL_ASSERT(imm.FitsInLane(zd)); |
| |
| int imm8; |
| int shift = -1; |
| if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) || |
| imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) { |
| MovprfxHelperScope guard(this, zd, zn); |
| switch (option) { |
| case kAddImmediate: |
| add(zd, zd, imm8, shift); |
| return true; |
| case kSubImmediate: |
| sub(zd, zd, imm8, shift); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
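| // Shared helper for wide-immediate forms (e.g. MUL, SMIN, UMAX): use the |
| // 8-bit immediate encoding when `imm` fits, otherwise broadcast the immediate |
| // into a Z register and fall back to the predicated register form, governed |
| // by an all-true predicate. |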
| void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn, |
| SVEArithPredicatedFn reg_macro, |
| const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm, |
| bool is_signed) { |
| if (is_signed) { |
| // E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi |
| if (imm.IsInt8()) { |
| MovprfxHelperScope guard(this, zd, zn); |
| (this->*imm_fn)(zd, zd, imm.AsInt8()); |
| return; |
| } |
| } else { |
| // E.g. UMIN_z_zi, UMAX_z_zi |
| if (imm.IsUint8()) { |
| MovprfxHelperScope guard(this, zd, zn); |
| (this->*imm_fn)(zd, zd, imm.AsUint8()); |
| return; |
| } |
| } |
| |
| UseScratchRegisterScope temps(this); |
| PRegister pg = temps.AcquireGoverningP(); |
| Ptrue(pg.WithSameLaneSizeAs(zd)); |
| |
| // Try to re-use zd if we can, so we can avoid a movprfx. |
| ZRegister scratch = |
| zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits()) |
| : zd; |
| Dup(scratch, imm); |
| |
| // The vector-form macro for commutative operations will swap the arguments if |
| // necessary to avoid a movprfx. |
| (this->*reg_macro)(zd, pg.Merging(), zn, scratch); |
| } |
| |
| void MacroAssembler::Mul(const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| IntArithImmFn imm_fn = &Assembler::mul; |
| SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul; |
| IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true); |
| } |
| |
| void MacroAssembler::Smin(const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(imm.FitsInSignedLane(zd)); |
| IntArithImmFn imm_fn = &Assembler::smin; |
| SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin; |
| IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true); |
| } |
| |
| void MacroAssembler::Smax(const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(imm.FitsInSignedLane(zd)); |
| IntArithImmFn imm_fn = &Assembler::smax; |
| SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax; |
| IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true); |
| } |
| |
| void MacroAssembler::Umax(const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(imm.FitsInUnsignedLane(zd)); |
| IntArithImmFn imm_fn = &Assembler::umax; |
| SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax; |
| IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false); |
| } |
| |
| void MacroAssembler::Umin(const ZRegister& zd, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(imm.FitsInUnsignedLane(zd)); |
| IntArithImmFn imm_fn = &Assembler::umin; |
| SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin; |
| IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false); |
| } |
| |
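| // Compute xd = xn + (PL * multiplier), where PL is the predicate register |
| // length in bytes (one eighth of the Z register length). |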
| void MacroAssembler::Addpl(const Register& xd, |
| const Register& xn, |
| int64_t multiplier) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| |
| // This macro relies on `Rdvl` to handle some out-of-range cases. Check that |
| // `VL * multiplier` cannot overflow, for any possible value of VL. |
| VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes)); |
| VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes)); |
| |
| if (xd.IsZero()) return; |
| if (xn.IsZero() && xd.IsSP()) { |
| // TODO: This operation doesn't make much sense, but we could support it |
| // with a scratch register if necessary. |
| VIXL_UNIMPLEMENTED(); |
| } |
| |
| // Handling xzr requires an extra move, so defer it until later so we can try |
| // to use `rdvl` instead (via `Addvl`). |
| if (IsInt6(multiplier) && !xn.IsZero()) { |
| SingleEmissionCheckScope guard(this); |
| addpl(xd, xn, static_cast<int>(multiplier)); |
| return; |
| } |
| |
| // If `multiplier` is a multiple of 8, we can use `Addvl` instead. |
| if ((multiplier % kZRegBitsPerPRegBit) == 0) { |
| Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit); |
| return; |
| } |
| |
| if (IsInt6(multiplier)) { |
| VIXL_ASSERT(xn.IsZero()); // Other cases were handled with `addpl`. |
| // There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so |
| // materialise a zero. |
| MacroEmissionCheckScope guard(this); |
| movz(xd, 0); |
| addpl(xd, xd, static_cast<int>(multiplier)); |
| return; |
| } |
| |
| // TODO: Some probable cases result in rather long sequences. For example, |
| // `Addpl(sp, sp, 33)` requires five instructions, even though it's only just |
| // outside the encodable range. We should look for ways to cover such cases |
| // without drastically increasing the complexity of this logic. |
| |
| // For other cases, calculate xn + (PL * multiplier) using discrete |
| // instructions. This requires two scratch registers in the general case, so |
| // try to re-use the destination as a scratch register. |
| UseScratchRegisterScope temps(this); |
| temps.Include(xd); |
| temps.Exclude(xn); |
| |
| Register scratch = temps.AcquireX(); |
| // There is no `rdpl`, so we have to calculate PL from VL. We can't scale the |
| // multiplier because (we already know) it isn't a multiple of 8. |
| Rdvl(scratch, multiplier); |
| |
| MacroEmissionCheckScope guard(this); |
| if (xn.IsZero()) { |
| asr(xd, scratch, kZRegBitsPerPRegBitLog2); |
| } else if (xd.IsSP() || xn.IsSP()) { |
| // TODO: MacroAssembler::Add should be able to handle this. |
| asr(scratch, scratch, kZRegBitsPerPRegBitLog2); |
| add(xd, xn, scratch); |
| } else { |
| add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2)); |
| } |
| } |
| |
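| // Compute xd = xn + (VL * multiplier), where VL is the Z register length in |
| // bytes. |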
| void MacroAssembler::Addvl(const Register& xd, |
| const Register& xn, |
| int64_t multiplier) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(xd.IsX()); |
| VIXL_ASSERT(xn.IsX()); |
| |
| // Check that `VL * multiplier` cannot overflow, for any possible value of VL. |
| VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes)); |
| VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes)); |
| |
| if (xd.IsZero()) return; |
| if (xn.IsZero() && xd.IsSP()) { |
| // TODO: This operation doesn't make much sense, but we could support it |
| // with a scratch register if necessary. `rdvl` cannot write into `sp`. |
| VIXL_UNIMPLEMENTED(); |
| } |
| |
| if (IsInt6(multiplier)) { |
| SingleEmissionCheckScope guard(this); |
| if (xn.IsZero()) { |
| rdvl(xd, static_cast<int>(multiplier)); |
| } else { |
| addvl(xd, xn, static_cast<int>(multiplier)); |
| } |
| return; |
| } |
| |
| // TODO: Some probable cases result in rather long sequences. For example, |
| // `Addvl(sp, sp, 42)` requires four instructions, even though it's only just |
| // outside the encodable range. We should look for ways to cover such cases |
| // without drastically increasing the complexity of this logic. |
| |
| // For other cases, calculate xn + (VL * multiplier) using discrete |
| // instructions. This requires two scratch registers in the general case, so |
| // we try to re-use the destination as a scratch register. |
| UseScratchRegisterScope temps(this); |
| temps.Include(xd); |
| temps.Exclude(xn); |
| |
| Register a = temps.AcquireX(); |
| Mov(a, multiplier); |
| |
| MacroEmissionCheckScope guard(this); |
| Register b = temps.AcquireX(); |
| rdvl(b, 1); |
| if (xn.IsZero()) { |
| mul(xd, a, b); |
| } else if (xd.IsSP() || xn.IsSP()) { |
| mul(a, a, b); |
| add(xd, xn, a); |
| } else { |
| madd(xd, a, b, xn); |
| } |
| } |
| |
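| // Resolve an SVE addressing mode into a plain X register address. For |
| // "MUL VL" offsets, `vl_divisor_log2` is log2 of the divisor applied to VL |
| // for the access, as constrained by the assertions below. |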
| void MacroAssembler::CalculateSVEAddress(const Register& xd, |
| const SVEMemOperand& addr, |
| int vl_divisor_log2) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(!addr.IsScatterGather()); |
| VIXL_ASSERT(xd.IsX()); |
| |
| // The lower bound is where a whole Z register is accessed. |
| VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0)); |
| // The upper bound is for P register accesses, and for instructions like |
| // "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane. |
| VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2)); |
| |
| SVEOffsetModifier mod = addr.GetOffsetModifier(); |
| Register base = addr.GetScalarBase(); |
| |
| if (addr.IsEquivalentToScalar()) { |
| // For example: |
| // [x0] |
| // [x0, #0] |
| // [x0, xzr, LSL #2] |
| Mov(xd, base); |
| } else if (addr.IsScalarPlusImmediate()) { |
| // For example: |
| // [x0, #42] |
| // [x0, #42, MUL VL] |
| int64_t offset = addr.GetImmediateOffset(); |
| VIXL_ASSERT(offset != 0); // Handled by IsEquivalentToScalar. |
| if (addr.IsMulVl()) { |
| int vl_divisor = 1 << vl_divisor_log2; |
| // For all possible values of vl_divisor, we can simply use `Addpl`. This |
| // will select `addvl` if necessary. |
| VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0); |
| Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor)); |
| } else { |
| // IsScalarPlusImmediate() ensures that no other modifiers can occur. |
| VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER); |
| Add(xd, base, offset); |
| } |
| } else if (addr.IsScalarPlusScalar()) { |
| // For example: |
| // [x0, x1] |
| // [x0, x1, LSL #4] |
| Register offset = addr.GetScalarOffset(); |
| VIXL_ASSERT(!offset.IsZero()); // Handled by IsEquivalentToScalar. |
| if (mod == SVE_LSL) { |
| Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount())); |
| } else { |
| // IsScalarPlusScalar() ensures that no other modifiers can occur. |
| VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER); |
| Add(xd, base, offset); |
| } |
| } else { |
| // All other forms are scatter-gather addresses, which cannot be evaluated |
| // into an X register. |
| VIXL_UNREACHABLE(); |
| } |
| } |
| |
| void MacroAssembler::Cpy(const ZRegister& zd, |
| const PRegister& pg, |
| IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(imm.FitsInLane(zd)); |
| int imm8; |
| int shift; |
| if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) || |
| imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) { |
| SingleEmissionCheckScope guard(this); |
| cpy(zd, pg, imm8, shift); |
| return; |
| } |
| |
| // The fallbacks rely on `cpy` variants that only support merging predication. |
| // If zeroing predication was requested, zero the destination first. |
| if (pg.IsZeroing()) { |
| SingleEmissionCheckScope guard(this); |
| dup(zd, 0); |
| } |
| PRegisterM pg_m = pg.Merging(); |
| |
| // Try to encode the immediate using fcpy. |
| VIXL_ASSERT(imm.FitsInLane(zd)); |
| if (zd.GetLaneSizeInBits() >= kHRegSize) { |
| double fp_imm = 0.0; |
| switch (zd.GetLaneSizeInBits()) { |
| case kHRegSize: |
| fp_imm = |
| FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN); |
| break; |
| case kSRegSize: |
| fp_imm = RawbitsToFloat(imm.AsUint32()); |
| break; |
| case kDRegSize: |
| fp_imm = RawbitsToDouble(imm.AsUint64()); |
| break; |
| default: |
| VIXL_UNREACHABLE(); |
| break; |
| } |
| // IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so |
| // we can use IsImmFP64 for all lane sizes. |
| if (IsImmFP64(fp_imm)) { |
| SingleEmissionCheckScope guard(this); |
| fcpy(zd, pg_m, fp_imm); |
| return; |
| } |
| } |
| |
| // Fall back to using a scratch register. |
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.AcquireRegisterToHoldLane(zd); |
| Mov(scratch, imm); |
| |
| SingleEmissionCheckScope guard(this); |
| cpy(zd, pg_m, scratch); |
| } |
| |
| // TODO: We implement Fcpy (amongst other things) for all FP types because it |
| // allows us to preserve user-specified NaNs. We should come up with some |
| // FPImmediate type to abstract this, and avoid all the duplication below (and |
| // elsewhere). |
| |
| void MacroAssembler::Fcpy(const ZRegister& zd, |
| const PRegisterM& pg, |
| double imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(pg.IsMerging()); |
| |
| if (IsImmFP64(imm)) { |
| SingleEmissionCheckScope guard(this); |
| fcpy(zd, pg, imm); |
| return; |
| } |
| |
| // As a fall-back, cast the immediate to the required lane size, and try to |
| // encode the bit pattern using `Cpy`. |
| Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm)); |
| } |
| |
| void MacroAssembler::Fcpy(const ZRegister& zd, |
| const PRegisterM& pg, |
| float imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(pg.IsMerging()); |
| |
| if (IsImmFP32(imm)) { |
| SingleEmissionCheckScope guard(this); |
| fcpy(zd, pg, imm); |
| return; |
| } |
| |
| // As a fall-back, cast the immediate to the required lane size, and try to |
| // encode the bit pattern using `Cpy`. |
| Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm)); |
| } |
| |
| void MacroAssembler::Fcpy(const ZRegister& zd, |
| const PRegisterM& pg, |
| Float16 imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(pg.IsMerging()); |
| |
| if (IsImmFP16(imm)) { |
| SingleEmissionCheckScope guard(this); |
| fcpy(zd, pg, imm); |
| return; |
| } |
| |
| // As a fall-back, cast the immediate to the required lane size, and try to |
| // encode the bit pattern using `Cpy`. |
| Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm)); |
| } |
| |
| void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(imm.FitsInLane(zd)); |
| unsigned lane_size = zd.GetLaneSizeInBits(); |
| int imm8; |
| int shift; |
| if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) || |
| imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) { |
| SingleEmissionCheckScope guard(this); |
| dup(zd, imm8, shift); |
| } else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) { |
| SingleEmissionCheckScope guard(this); |
| dupm(zd, imm.AsUintN(lane_size)); |
| } else { |
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.AcquireRegisterToHoldLane(zd); |
| Mov(scratch, imm); |
| |
| SingleEmissionCheckScope guard(this); |
| dup(zd, scratch); |
| } |
| } |
| |
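| // Helper for predicated, destructive, non-commutative operations that have a |
| // reversed-operand form (e.g. sub/subr, fdiv/fdivr). `fn` is used when zd can |
| // take the place of zn; `rev_fn` is used when zd aliases zm, so no scratch |
| // register is needed to preserve the operand order. |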
| void MacroAssembler::NoncommutativeArithmeticHelper( |
| const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| SVEArithPredicatedFn fn, |
| SVEArithPredicatedFn rev_fn) { |
| if (zd.Aliases(zn)) { |
| // E.g. zd = zd / zm |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zd, pg, zn, zm); |
| } else if (zd.Aliases(zm)) { |
| // E.g. zd = zn / zd |
| SingleEmissionCheckScope guard(this); |
| (this->*rev_fn)(zd, pg, zm, zn); |
| } else { |
| // E.g. zd = zn / zm |
| MovprfxHelperScope guard(this, zd, pg, zn); |
| (this->*fn)(zd, pg, zd, zm); |
| } |
| } |
| |
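| // Helper for predicated, destructive, commutative FP operations. When zd |
| // aliases zm, FastNaNPropagation permits swapping the operands to avoid a |
| // movprfx, whereas StrictNaNPropagation preserves the operand order by |
| // computing into a scratch register. |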
| void MacroAssembler::FPCommutativeArithmeticHelper( |
| const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| SVEArithPredicatedFn fn, |
| FPMacroNaNPropagationOption nan_option) { |
| ResolveFPNaNPropagationOption(&nan_option); |
| |
| if (zd.Aliases(zn)) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zd, pg, zd, zm); |
| } else if (zd.Aliases(zm)) { |
| switch (nan_option) { |
| case FastNaNPropagation: { |
| // Swap the arguments. |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zd, pg, zd, zn); |
| return; |
| } |
| case StrictNaNPropagation: { |
| UseScratchRegisterScope temps(this); |
| // Use a scratch register to keep the argument order exactly as |
| // specified. |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn); |
| { |
| MovprfxHelperScope guard(this, scratch, pg, zn); |
| (this->*fn)(scratch, pg, scratch, zm); |
| } |
| Mov(zd, scratch); |
| return; |
| } |
| case NoFPMacroNaNPropagationSelected: |
| VIXL_UNREACHABLE(); |
| return; |
| } |
| } else { |
| MovprfxHelperScope guard(this, zd, pg, zn); |
| (this->*fn)(zd, pg, zd, zm); |
| } |
| } |
| |
| // Instructions of the form "inst zda, zn, zm, #num", where they are |
| // non-commutative and no reversed form is provided. |
| #define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \ |
| V(Cmla, cmla) \ |
| V(Sqrdcmlah, sqrdcmlah) |
| |
| #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \ |
| void MacroAssembler::MASMFN(const ZRegister& zd, \ |
| const ZRegister& za, \ |
| const ZRegister& zn, \ |
| const ZRegister& zm, \ |
| int imm) { \ |
| if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \ |
| UseScratchRegisterScope temps(this); \ |
| VIXL_ASSERT(AreSameLaneSize(zn, zm)); \ |
| ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); \ |
| Mov(ztmp, zd.Aliases(zn) ? zn : zm); \ |
| MovprfxHelperScope guard(this, zd, za); \ |
| ASMFN(zd, \ |
| (zd.Aliases(zn) ? ztmp : zn), \ |
| (zd.Aliases(zm) ? ztmp : zm), \ |
| imm); \ |
| } else { \ |
| MovprfxHelperScope guard(this, zd, za); \ |
| ASMFN(zd, zn, zm, imm); \ |
| } \ |
| } |
| VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC) |
| #undef VIXL_DEFINE_MASM_FUNC |
| |
| // Instructions of the form "inst zda, zn, zm, #num, #num", where they are |
| // non-commutative and no reversed form is provided. |
| #define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \ |
| V(Cmla, cmla) \ |
| V(Sqrdcmlah, sqrdcmlah) |
| |
| // This doesn't handle zm when it's outside the range that can be encoded in |
| // the instruction. The range depends on the element size: z0-z7 for H, |
| // z0-z15 for S. |
| #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \ |
| void MacroAssembler::MASMFN(const ZRegister& zd, \ |
| const ZRegister& za, \ |
| const ZRegister& zn, \ |
| const ZRegister& zm, \ |
| int index, \ |
| int rot) { \ |
| if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \ |
| UseScratchRegisterScope temps(this); \ |
| ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); \ |
| { \ |
| MovprfxHelperScope guard(this, ztmp, za); \ |
| ASMFN(ztmp, zn, zm, index, rot); \ |
| } \ |
| Mov(zd, ztmp); \ |
| } else { \ |
| MovprfxHelperScope guard(this, zd, za); \ |
| ASMFN(zd, zn, zm, index, rot); \ |
| } \ |
| } |
| VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC) |
| #undef VIXL_DEFINE_MASM_FUNC |
| |
| // Instructions of the form "inst zda, pg, zda, zn", where they are |
| // non-commutative and no reversed form is provided. |
| #define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \ |
| V(Addp, addp) \ |
| V(Bic, bic) \ |
| V(Faddp, faddp) \ |
| V(Fmaxnmp, fmaxnmp) \ |
| V(Fminnmp, fminnmp) \ |
| V(Fmaxp, fmaxp) \ |
| V(Fminp, fminp) \ |
| V(Fscale, fscale) \ |
| V(Smaxp, smaxp) \ |
| V(Sminp, sminp) \ |
| V(Suqadd, suqadd) \ |
| V(Umaxp, umaxp) \ |
| V(Uminp, uminp) \ |
| V(Usqadd, usqadd) |
| |
| #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \ |
| void MacroAssembler::MASMFN(const ZRegister& zd, \ |
| const PRegisterM& pg, \ |
| const ZRegister& zn, \ |
| const ZRegister& zm) { \ |
| VIXL_ASSERT(allow_macro_instructions_); \ |
| if (zd.Aliases(zm) && !zd.Aliases(zn)) { \ |
| UseScratchRegisterScope temps(this); \ |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \ |
| Mov(scratch, zm); \ |
| MovprfxHelperScope guard(this, zd, pg, zn); \ |
| ASMFN(zd, pg, zd, scratch); \ |
| } else { \ |
| MovprfxHelperScope guard(this, zd, pg, zn); \ |
| ASMFN(zd, pg, zd, zm); \ |
| } \ |
| } |
| VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC) |
| #undef VIXL_DEFINE_MASM_FUNC |
| |
| // Instructions of the form "inst zda, pg, zda, zn", where they are |
| // non-commutative and a reversed form is provided. |
| #define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \ |
| V(Asr, asr) \ |
| V(Fdiv, fdiv) \ |
| V(Fsub, fsub) \ |
| V(Lsl, lsl) \ |
| V(Lsr, lsr) \ |
| V(Sdiv, sdiv) \ |
| V(Shsub, shsub) \ |
| V(Sqrshl, sqrshl) \ |
| V(Sqshl, sqshl) \ |
| V(Sqsub, sqsub) \ |
| V(Srshl, srshl) \ |
| V(Sub, sub) \ |
| V(Udiv, udiv) \ |
| V(Uhsub, uhsub) \ |
| V(Uqrshl, uqrshl) \ |
| V(Uqshl, uqshl) \ |
| V(Uqsub, uqsub) \ |
| V(Urshl, urshl) |
| |
| #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \ |
| void MacroAssembler::MASMFN(const ZRegister& zd, \ |
| const PRegisterM& pg, \ |
| const ZRegister& zn, \ |
| const ZRegister& zm) { \ |
| VIXL_ASSERT(allow_macro_instructions_); \ |
| NoncommutativeArithmeticHelper(zd, \ |
| pg, \ |
| zn, \ |
| zm, \ |
| static_cast<SVEArithPredicatedFn>( \ |
| &Assembler::ASMFN), \ |
| static_cast<SVEArithPredicatedFn>( \ |
| &Assembler::ASMFN##r)); \ |
| } |
| VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC) |
| #undef VIXL_DEFINE_MASM_FUNC |
| |
| void MacroAssembler::Fadd(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fadd), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fabd(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fabd), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fmul(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fmul), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fmulx(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fmulx), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fmax(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fmax), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fmin(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fmin), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fmaxnm(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fmaxnm), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fminnm(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPCommutativeArithmeticHelper(zd, |
| pg, |
| zn, |
| zm, |
| static_cast<SVEArithPredicatedFn>( |
| &Assembler::fminnm), |
| nan_option); |
| } |
| |
| void MacroAssembler::Fdup(const ZRegister& zd, double imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| |
| switch (zd.GetLaneSizeInBits()) { |
| case kHRegSize: |
| Fdup(zd, Float16(imm)); |
| break; |
| case kSRegSize: |
| Fdup(zd, static_cast<float>(imm)); |
| break; |
| case kDRegSize: |
| uint64_t bits = DoubleToRawbits(imm); |
| if (IsImmFP64(bits)) { |
| SingleEmissionCheckScope guard(this); |
| fdup(zd, imm); |
| } else { |
| Dup(zd, bits); |
| } |
| break; |
| } |
| } |
| |
| void MacroAssembler::Fdup(const ZRegister& zd, float imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| |
| switch (zd.GetLaneSizeInBits()) { |
| case kHRegSize: |
| Fdup(zd, Float16(imm)); |
| break; |
| case kSRegSize: |
| if (IsImmFP32(imm)) { |
| SingleEmissionCheckScope guard(this); |
| fdup(zd, imm); |
| } else { |
| Dup(zd, FloatToRawbits(imm)); |
| } |
| break; |
| case kDRegSize: |
| Fdup(zd, static_cast<double>(imm)); |
| break; |
| } |
| } |
| |
| void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| |
| switch (zd.GetLaneSizeInBits()) { |
| case kHRegSize: |
| if (IsImmFP16(imm)) { |
| SingleEmissionCheckScope guard(this); |
| fdup(zd, imm); |
| } else { |
| Dup(zd, Float16ToRawbits(imm)); |
| } |
| break; |
| case kSRegSize: |
| Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN)); |
| break; |
| case kDRegSize: |
| Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN)); |
| break; |
| } |
| } |
| |
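| // Set zd so that lane i holds (start + i * step). Operands that cannot be |
| // encoded as 5-bit signed immediates are first materialised in scalar |
| // registers. |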
| void MacroAssembler::Index(const ZRegister& zd, |
| const Operand& start, |
| const Operand& step) { |
| class IndexOperand : public Operand { |
| public: |
| static IndexOperand Prepare(MacroAssembler* masm, |
| UseScratchRegisterScope* temps, |
| const Operand& op, |
| const ZRegister& zd_inner) { |
| // Look for encodable immediates. |
| int imm; |
| if (op.IsImmediate()) { |
| if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) { |
| return IndexOperand(imm); |
| } |
| Register scratch = temps->AcquireRegisterToHoldLane(zd_inner); |
| masm->Mov(scratch, op); |
| return IndexOperand(scratch); |
| } else { |
| // Plain registers can be encoded directly. |
| VIXL_ASSERT(op.IsPlainRegister()); |
| return IndexOperand(op.GetRegister()); |
| } |
| } |
| |
| int GetImm5() const { |
| int64_t imm = GetImmediate(); |
| VIXL_ASSERT(IsInt5(imm)); |
| return static_cast<int>(imm); |
| } |
| |
| private: |
| explicit IndexOperand(const Register& reg) : Operand(reg) {} |
| explicit IndexOperand(int64_t imm) : Operand(imm) {} |
| }; |
| |
| UseScratchRegisterScope temps(this); |
| IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd); |
| IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd); |
| |
| SingleEmissionCheckScope guard(this); |
| if (start_enc.IsImmediate()) { |
| if (step_enc.IsImmediate()) { |
| index(zd, start_enc.GetImm5(), step_enc.GetImm5()); |
| } else { |
| index(zd, start_enc.GetImm5(), step_enc.GetRegister()); |
| } |
| } else { |
| if (step_enc.IsImmediate()) { |
| index(zd, start_enc.GetRegister(), step_enc.GetImm5()); |
| } else { |
| index(zd, start_enc.GetRegister(), step_enc.GetRegister()); |
| } |
| } |
| } |
| |
| void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(imm.FitsInLane(zdn)); |
| |
| if (imm.IsZero()) { |
| SingleEmissionCheckScope guard(this); |
| insr(zdn, xzr); |
| return; |
| } |
| |
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.AcquireRegisterToHoldLane(zdn); |
| |
| // TODO: There are many cases where we could optimise immediates, such as by |
| // detecting repeating patterns or FP immediates. We should optimise and |
| // abstract this for use in other SVE mov-immediate-like macros. |
| Mov(scratch, imm); |
| |
| SingleEmissionCheckScope guard(this); |
| insr(zdn, scratch); |
| } |
| |
| void MacroAssembler::Mla(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (zd.Aliases(za)) { |
| // zda = zda + (zn * zm) |
| SingleEmissionCheckScope guard(this); |
| mla(zd, pg, zn, zm); |
| } else if (zd.Aliases(zn)) { |
| // zdn = za + (zdn * zm) |
| SingleEmissionCheckScope guard(this); |
| mad(zd, pg, zm, za); |
| } else if (zd.Aliases(zm)) { |
| // Multiplication is commutative, so we can swap zn and zm. |
| // zdm = za + (zdm * zn) |
| SingleEmissionCheckScope guard(this); |
| mad(zd, pg, zn, za); |
| } else { |
| // zd = za + (zn * zm) |
| ExactAssemblyScope guard(this, 2 * kInstructionSize); |
| movprfx(zd, pg, za); |
| mla(zd, pg, zn, zm); |
| } |
| } |
| |
| void MacroAssembler::Mls(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (zd.Aliases(za)) { |
| // zda = zda - (zn * zm) |
| SingleEmissionCheckScope guard(this); |
| mls(zd, pg, zn, zm); |
| } else if (zd.Aliases(zn)) { |
| // zdn = za - (zdn * zm) |
| SingleEmissionCheckScope guard(this); |
| msb(zd, pg, zm, za); |
| } else if (zd.Aliases(zm)) { |
| // Multiplication is commutative, so we can swap zn and zm. |
| // zdm = za - (zdm * zn) |
| SingleEmissionCheckScope guard(this); |
| msb(zd, pg, zn, za); |
| } else { |
| // zd = za - (zn * zm) |
| ExactAssemblyScope guard(this, 2 * kInstructionSize); |
| movprfx(zd, pg, za); |
| mls(zd, pg, zn, zm); |
| } |
| } |
| |
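| // Compare zn against an immediate by broadcasting the immediate to a scratch |
| // Z register and using the all-register compare form. |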
| void MacroAssembler::CompareHelper(Condition cond, |
| const PRegisterWithLaneSize& pd, |
| const PRegisterZ& pg, |
| const ZRegister& zn, |
| IntegerOperand imm) { |
| UseScratchRegisterScope temps(this); |
| ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits()); |
| Dup(zm, imm); |
| SingleEmissionCheckScope guard(this); |
| cmp(cond, pd, pg, zn, zm); |
| } |
| |
| void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd, |
| const PRegister& pg, |
| const PRegisterWithLaneSize& pn) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(pd.IsLaneSizeB()); |
| VIXL_ASSERT(pn.IsLaneSizeB()); |
| if (pd.Is(pn)) { |
| SingleEmissionCheckScope guard(this); |
| pfirst(pd, pg, pn); |
| } else { |
| UseScratchRegisterScope temps(this); |
| PRegister temp_pg = pg; |
| if (pd.Aliases(pg)) { |
| temp_pg = temps.AcquireP(); |
| Mov(temp_pg.VnB(), pg.VnB()); |
| } |
| Mov(pd, pn); |
| SingleEmissionCheckScope guard(this); |
| pfirst(pd, temp_pg, pd); |
| } |
| } |
| |
| void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd, |
| const PRegister& pg, |
| const PRegisterWithLaneSize& pn) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(AreSameFormat(pd, pn)); |
| if (pd.Is(pn)) { |
| SingleEmissionCheckScope guard(this); |
| pnext(pd, pg, pn); |
| } else { |
| UseScratchRegisterScope temps(this); |
| PRegister temp_pg = pg; |
| if (pd.Aliases(pg)) { |
| temp_pg = temps.AcquireP(); |
| Mov(temp_pg.VnB(), pg.VnB()); |
| } |
| Mov(pd.VnB(), pn.VnB()); |
| SingleEmissionCheckScope guard(this); |
| pnext(pd, temp_pg, pd); |
| } |
| } |
| |
| void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd, |
| SVEPredicateConstraint pattern, |
| FlagsUpdate s) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| switch (s) { |
| case LeaveFlags: |
| Ptrue(pd, pattern); |
| return; |
| case SetFlags: |
| Ptrues(pd, pattern); |
| return; |
| } |
| VIXL_UNREACHABLE(); |
| } |
| |
| void MacroAssembler::Sub(const ZRegister& zd, |
| IntegerOperand imm, |
| const ZRegister& zm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| |
| int imm8; |
| int shift = -1; |
| if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) || |
| imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) { |
| MovprfxHelperScope guard(this, zd, zm); |
| subr(zd, zd, imm8, shift); |
| } else { |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits()); |
| Dup(scratch, imm); |
| |
| SingleEmissionCheckScope guard(this); |
| sub(zd, scratch, zm); |
| } |
| } |
| |
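| // Helper for broadcast loads: the immediate offset must be a multiple of the |
| // access size (`divisor`) and fit in an unsigned 6-bit field once scaled; |
| // otherwise the address is computed into a scratch register. |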
| void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr, |
| SVELoadBroadcastFn fn, |
| int divisor) { |
| VIXL_ASSERT(addr.IsScalarPlusImmediate()); |
| int64_t imm = addr.GetImmediateOffset(); |
| if ((imm % divisor == 0) && IsUint6(imm / divisor)) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, addr); |
| } else { |
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.AcquireX(); |
| CalculateSVEAddress(scratch, addr, zt); |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, SVEMemOperand(scratch)); |
| } |
| } |
| |
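| // Helper for LDR/STR of Z and P registers, which take a signed 9-bit |
| // "MUL VL" immediate; other offsets are resolved into a scratch register. |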
| void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt, |
| const SVEMemOperand& addr, |
| SVELoadStoreFn fn) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister()); |
| |
| if (addr.IsPlainScalar() || |
| (addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) && |
| addr.IsMulVl())) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(rt, addr); |
| return; |
| } |
| |
| if (addr.IsEquivalentToScalar()) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(rt, SVEMemOperand(addr.GetScalarBase())); |
| return; |
| } |
| |
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.AcquireX(); |
| CalculateSVEAddress(scratch, addr, rt); |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(rt, SVEMemOperand(scratch)); |
| } |
| |
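| // Helper for non-temporal and broadcasting (quad/octo-word) loads and stores. |
| // The scalar-plus-immediate form is used when the offset is a multiple of |
| // (1 << shift_amount), fits in a signed `imm_bits`-bit field once scaled, and |
| // uses `supported_modifier`; other encodable forms are emitted directly, and |
| // anything else falls back to a scratch register address calculation. |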
| template <typename Tg, typename Tf> |
| void MacroAssembler::SVELoadStoreNTBroadcastQOHelper( |
| const ZRegister& zt, |
| const Tg& pg, |
| const SVEMemOperand& addr, |
| Tf fn, |
| int imm_bits, |
| int shift_amount, |
| SVEOffsetModifier supported_modifier, |
| int vl_divisor_log2) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| int imm_divisor = 1 << shift_amount; |
| |
| if (addr.IsPlainScalar() || |
| (addr.IsScalarPlusImmediate() && |
| IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) && |
| ((addr.GetImmediateOffset() % imm_divisor) == 0) && |
| (addr.GetOffsetModifier() == supported_modifier))) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, addr); |
| return; |
| } |
| |
| if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() && |
| addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, addr); |
| return; |
| } |
| |
| if (addr.IsEquivalentToScalar()) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase())); |
| return; |
| } |
| |
| if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) && |
| (vl_divisor_log2 == -1)) { |
| // We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL |
| // dependent. |
| VIXL_UNIMPLEMENTED(); |
| } |
| |
| UseScratchRegisterScope temps(this); |
| Register scratch = temps.AcquireX(); |
| CalculateSVEAddress(scratch, addr, vl_divisor_log2); |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, SVEMemOperand(scratch)); |
| } |
| |
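| // Helper for the contiguous and scatter-gather LD1/ST1 families. Encodable |
| // forms are emitted directly; unencodable vector-plus-immediate gathers are |
| // rewritten as scalar-plus-vector accesses, and other scalar-based forms have |
| // their address computed into a scratch X register. |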
| template <typename Tg, typename Tf> |
| void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2, |
| const ZRegister& zt, |
| const Tg& pg, |
| const SVEMemOperand& addr, |
| Tf fn) { |
| if (addr.IsPlainScalar() || |
| (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() && |
| addr.IsEquivalentToLSL(msize_in_bytes_log2)) || |
| (addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) && |
| addr.IsMulVl())) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, addr); |
| return; |
| } |
| |
| if (addr.IsEquivalentToScalar()) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase())); |
| return; |
| } |
| |
| if (addr.IsVectorPlusImmediate()) { |
| uint64_t offset = addr.GetImmediateOffset(); |
| if (IsMultiple(offset, (1 << msize_in_bytes_log2)) && |
| IsUint5(offset >> msize_in_bytes_log2)) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, addr); |
| return; |
| } |
| } |
| |
| if (addr.IsScalarPlusVector()) { |
| VIXL_ASSERT(addr.IsScatterGather()); |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, addr); |
| return; |
| } |
| |
| UseScratchRegisterScope temps(this); |
| if (addr.IsScatterGather()) { |
| // In scatter-gather modes, zt and zn/zm have the same lane size. However, |
| // for 32-bit accesses, the result of each lane's address calculation still |
| // requires 64 bits; we can't naively use `Adr` for the address calculation |
| // because it would truncate each address to 32 bits. |
| |
| if (addr.IsVectorPlusImmediate()) { |
| // Synthesise the immediate in an X register, then use a |
| // scalar-plus-vector access with the original vector. |
| Register scratch = temps.AcquireX(); |
| Mov(scratch, addr.GetImmediateOffset()); |
| SingleEmissionCheckScope guard(this); |
| SVEOffsetModifier om = |
| zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER; |
| (this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om)); |
| return; |
| } |
| |
| VIXL_UNIMPLEMENTED(); |
| } else { |
| Register scratch = temps.AcquireX(); |
| // TODO: If we have an immediate offset that is a multiple of |
| // msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to |
| // save an instruction. |
| int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2; |
| CalculateSVEAddress(scratch, addr, vl_divisor_log2); |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, SVEMemOperand(scratch)); |
| } |
| } |
| |
| template <typename Tf> |
| void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2, |
| const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr, |
| Tf fn) { |
| if (addr.IsScatterGather()) { |
| // Scatter-gather first-fault loads share encodings with normal loads. |
| SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn); |
| return; |
| } |
| |
| // Contiguous first-faulting loads have no scalar-plus-immediate form at all, |
| // so we don't do immediate synthesis. |
| |
| // We cannot currently distinguish "[x0]" from "[x0, #0]", and this |
| // is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here. |
| if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() && |
| addr.IsEquivalentToLSL(msize_in_bytes_log2))) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zt, pg, addr); |
| return; |
| } |
| |
| VIXL_UNIMPLEMENTED(); |
| } |
| |
| void MacroAssembler::Ld1b(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kBRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ld1b)); |
| } |
| |
| void MacroAssembler::Ld1h(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kHRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ld1h)); |
| } |
| |
| void MacroAssembler::Ld1w(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kWRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ld1w)); |
| } |
| |
| void MacroAssembler::Ld1d(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kDRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ld1d)); |
| } |
| |
| void MacroAssembler::Ld1sb(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kBRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ld1sb)); |
| } |
| |
| void MacroAssembler::Ld1sh(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kHRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ld1sh)); |
| } |
| |
| void MacroAssembler::Ld1sw(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kSRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ld1sw)); |
| } |
| |
| void MacroAssembler::St1b(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kBRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVEStore1Fn>(&Assembler::st1b)); |
| } |
| |
| void MacroAssembler::St1h(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kHRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVEStore1Fn>(&Assembler::st1h)); |
| } |
| |
| void MacroAssembler::St1w(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kSRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVEStore1Fn>(&Assembler::st1w)); |
| } |
| |
| void MacroAssembler::St1d(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadStore1Helper(kDRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVEStore1Fn>(&Assembler::st1d)); |
| } |
| |
| void MacroAssembler::Ldff1b(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadFFHelper(kBRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ldff1b)); |
| } |
| |
| void MacroAssembler::Ldff1h(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadFFHelper(kHRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ldff1h)); |
| } |
| |
| void MacroAssembler::Ldff1w(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadFFHelper(kSRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ldff1w)); |
| } |
| |
| void MacroAssembler::Ldff1d(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadFFHelper(kDRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ldff1d)); |
| } |
| |
| void MacroAssembler::Ldff1sb(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadFFHelper(kBRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ldff1sb)); |
| } |
| |
| void MacroAssembler::Ldff1sh(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadFFHelper(kHRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ldff1sh)); |
| } |
| |
| void MacroAssembler::Ldff1sw(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVELoadFFHelper(kSRegSizeInBytesLog2, |
| zt, |
| pg, |
| addr, |
| static_cast<SVELoad1Fn>(&Assembler::ldff1sw)); |
| } |
| |
| #define VIXL_SVE_LD1R_LIST(V) \ |
| V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5) |
| |
| #define VIXL_DEFINE_MASM_FUNC(SZ, SH) \ |
| void MacroAssembler::Ld1r##SZ(const ZRegister& zt, \ |
| const PRegisterZ& pg, \ |
| const SVEMemOperand& addr) { \ |
| VIXL_ASSERT(allow_macro_instructions_); \ |
| SVELoadStoreNTBroadcastQOHelper(zt, \ |
| pg, \ |
| addr, \ |
| &MacroAssembler::ld1r##SZ, \ |
| 4, \ |
| SH, \ |
| NO_SVE_OFFSET_MODIFIER, \ |
| -1); \ |
| } |
| |
| VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC) |
| |
| #undef VIXL_DEFINE_MASM_FUNC |
| #undef VIXL_SVE_LD1R_LIST |
| |
| void MacroAssembler::Ldnt1b(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| ldnt1b(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::ldnt1b, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
| void MacroAssembler::Ldnt1d(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| ldnt1d(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::ldnt1d, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
| void MacroAssembler::Ldnt1h(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| ldnt1h(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::ldnt1h, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
| void MacroAssembler::Ldnt1w(const ZRegister& zt, |
| const PRegisterZ& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| ldnt1w(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::ldnt1w, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
| void MacroAssembler::Stnt1b(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| stnt1b(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::stnt1b, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
| void MacroAssembler::Stnt1d(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| stnt1d(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::stnt1d, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
| void MacroAssembler::Stnt1h(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| stnt1h(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::stnt1h, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
| void MacroAssembler::Stnt1w(const ZRegister& zt, |
| const PRegister& pg, |
| const SVEMemOperand& addr) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (addr.IsVectorPlusScalar()) { |
| SingleEmissionCheckScope guard(this); |
| stnt1w(zt, pg, addr); |
| } else { |
| SVELoadStoreNTBroadcastQOHelper(zt, |
| pg, |
| addr, |
| &MacroAssembler::stnt1w, |
| 4, |
| 0, |
| SVE_MUL_VL); |
| } |
| } |
| |
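| // Helper for indexed dot-product accumulations: zd = za + (zn . zm[index]). |
| // A scratch register is used when zd aliases zn or zm but not za, because the |
| // instruction form is destructive on the accumulator. |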
| void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn, |
| const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int index) { |
| if (zd.Aliases(za)) { |
| // zda = zda + (zn . zm) |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zd, zn, zm, index); |
| |
| } else if (zd.Aliases(zn) || zd.Aliases(zm)) { |
| // zdn = za + (zdn . zm[index]) |
| // zdm = za + (zn . zdm[index]) |
| // zdnm = za + (zdnm . zdnm[index]) |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, za); |
| (this->*fn)(scratch, zn, zm, index); |
| } |
| |
| Mov(zd, scratch); |
| } else { |
| // zd = za + (zn . zm) |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, zn, zm, index); |
| } |
| } |
| |
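| // Helpers for destructive accumulating operations of the form |
| // "zd = za op (zn, zm)". If zd aliases zn or zm but not za, the result is |
| // computed in a scratch register (prefixed from za with movprfx) and then |
| // moved into zd; otherwise zd is prefixed from za and the instruction is |
| // emitted directly. |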
| void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn, |
| const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) { |
| // zd = za . zd . zm |
| // zd = za . zn . zd |
| // zd = za . zd . zd |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, za); |
| (this->*fn)(scratch, zn, zm); |
| } |
| |
| Mov(zd, scratch); |
| } else { |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, zn, zm); |
| } |
| } |
| |
| void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn, |
| const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) { |
| // zd = za . zd . zm |
| // zd = za . zn . zd |
| // zd = za . zd . zd |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, za); |
| (this->*fn)(scratch, scratch, zn, zm); |
| } |
| |
| Mov(zd, scratch); |
| } else { |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, zd, zn, zm); |
| } |
| } |
| |
| void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn, |
| const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int imm) { |
| if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) { |
| // zd = za . zd . zm[i] |
| // zd = za . zn . zd[i] |
| // zd = za . zd . zd[i] |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, za); |
| (this->*fn)(scratch, zn, zm, imm); |
| } |
| |
| Mov(zd, scratch); |
| } else { |
| // zd = za . zn . zm[i] |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, zn, zm, imm); |
| } |
| } |
| |
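| // Helper for absolute-difference-and-accumulate operations (e.g. saba, uaba |
| // and their widening forms). If zn aliases zm, the difference is zero and the |
| // result is simply za; other aliasing of zd with the sources is resolved |
| // through a scratch register. |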
| void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn, |
| const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| if (zn.Aliases(zm)) { |
| // If zn == zm, the difference is zero. |
| if (!zd.Aliases(za)) { |
| Mov(zd, za); |
| } |
| } else if (zd.Aliases(za)) { |
| SingleEmissionCheckScope guard(this); |
| (this->*fn)(zd, zn, zm); |
| } else if (zd.Aliases(zn)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits()); |
| Mov(ztmp, zn); |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, ztmp, zm); |
| } else if (zd.Aliases(zm)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits()); |
| Mov(ztmp, zm); |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, zn, ztmp); |
| } else { |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, zn, zm); |
| } |
| } |
| |
| #define VIXL_SVE_4REG_LIST(V) \ |
| V(Saba, saba, AbsoluteDifferenceAccumulate) \ |
| V(Uaba, uaba, AbsoluteDifferenceAccumulate) \ |
| V(Sabalb, sabalb, AbsoluteDifferenceAccumulate) \ |
| V(Sabalt, sabalt, AbsoluteDifferenceAccumulate) \ |
| V(Uabalb, uabalb, AbsoluteDifferenceAccumulate) \ |
| V(Uabalt, uabalt, AbsoluteDifferenceAccumulate) \ |
| V(Sdot, sdot, FourRegDestructiveHelper) \ |
| V(Udot, udot, FourRegDestructiveHelper) \ |
| V(Adclb, adclb, FourRegDestructiveHelper) \ |
| V(Adclt, adclt, FourRegDestructiveHelper) \ |
| V(Sbclb, sbclb, FourRegDestructiveHelper) \ |
| V(Sbclt, sbclt, FourRegDestructiveHelper) \ |
| V(Smlalb, smlalb, FourRegDestructiveHelper) \ |
| V(Smlalt, smlalt, FourRegDestructiveHelper) \ |
| V(Smlslb, smlslb, FourRegDestructiveHelper) \ |
| V(Smlslt, smlslt, FourRegDestructiveHelper) \ |
| V(Umlalb, umlalb, FourRegDestructiveHelper) \ |
| V(Umlalt, umlalt, FourRegDestructiveHelper) \ |
| V(Umlslb, umlslb, FourRegDestructiveHelper) \ |
| V(Umlslt, umlslt, FourRegDestructiveHelper) \ |
| V(Bcax, bcax, FourRegDestructiveHelper) \ |
| V(Bsl, bsl, FourRegDestructiveHelper) \ |
| V(Bsl1n, bsl1n, FourRegDestructiveHelper) \ |
| V(Bsl2n, bsl2n, FourRegDestructiveHelper) \ |
| V(Eor3, eor3, FourRegDestructiveHelper) \ |
| V(Nbsl, nbsl, FourRegDestructiveHelper) \ |
| V(Fmlalb, fmlalb, FourRegDestructiveHelper) \ |
| V(Fmlalt, fmlalt, FourRegDestructiveHelper) \ |
| V(Fmlslb, fmlslb, FourRegDestructiveHelper) \ |
| V(Fmlslt, fmlslt, FourRegDestructiveHelper) \ |
| V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper) \ |
| V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \ |
| V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper) \ |
| V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper) \ |
| V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \ |
| V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper) \ |
| V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper) \ |
| V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper) \ |
| V(Fmmla, fmmla, FourRegDestructiveHelper) \ |
| V(Smmla, smmla, FourRegDestructiveHelper) \ |
| V(Ummla, ummla, FourRegDestructiveHelper) \ |
| V(Usmmla, usmmla, FourRegDestructiveHelper) \ |
| V(Usdot, usdot, FourRegDestructiveHelper) |
| |
| #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \ |
| void MacroAssembler::MASMFN(const ZRegister& zd, \ |
| const ZRegister& za, \ |
| const ZRegister& zn, \ |
| const ZRegister& zm) { \ |
| VIXL_ASSERT(allow_macro_instructions_); \ |
| HELPER(&Assembler::ASMFN, zd, za, zn, zm); \ |
| } |
| VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC) |
| #undef VIXL_DEFINE_MASM_FUNC |
| |
| #define VIXL_SVE_4REG_1IMM_LIST(V) \ |
| V(Fmla, fmla, FourRegOneImmDestructiveHelper) \ |
| V(Fmls, fmls, FourRegOneImmDestructiveHelper) \ |
| V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper) \ |
| V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper) \ |
| V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper) \ |
| V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper) \ |
| V(Mla, mla, FourRegOneImmDestructiveHelper) \ |
| V(Mls, mls, FourRegOneImmDestructiveHelper) \ |
| V(Smlalb, smlalb, FourRegOneImmDestructiveHelper) \ |
| V(Smlalt, smlalt, FourRegOneImmDestructiveHelper) \ |
| V(Smlslb, smlslb, FourRegOneImmDestructiveHelper) \ |
| V(Smlslt, smlslt, FourRegOneImmDestructiveHelper) \ |
| V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \ |
| V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \ |
| V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \ |
| V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \ |
| V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \ |
| V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \ |
| V(Umlalb, umlalb, FourRegOneImmDestructiveHelper) \ |
| V(Umlalt, umlalt, FourRegOneImmDestructiveHelper) \ |
| V(Umlslb, umlslb, FourRegOneImmDestructiveHelper) \ |
| V(Umlslt, umlslt, FourRegOneImmDestructiveHelper) |
| |
| #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \ |
| void MacroAssembler::MASMFN(const ZRegister& zd, \ |
| const ZRegister& za, \ |
| const ZRegister& zn, \ |
| const ZRegister& zm, \ |
| int imm) { \ |
| VIXL_ASSERT(allow_macro_instructions_); \ |
| HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm); \ |
| } |
| VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC) |
| #undef VIXL_DEFINE_MASM_FUNC |
| |
| void MacroAssembler::Sdot(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int index) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index); |
| } |
| |
| void MacroAssembler::Udot(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int index) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index); |
| } |
| |
| void MacroAssembler::Sudot(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int index) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index); |
| } |
| |
| void MacroAssembler::Usdot(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int index) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index); |
| } |
| |
| void MacroAssembler::Cdot(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int index, |
| int rot) { |
  VIXL_ASSERT(allow_macro_instructions_);
  // This doesn't handle the case where zm is outside the range that can be
  // encoded in the instruction. The range depends on the source lane size:
  // z0-z7 for B lanes, z0-z15 for H lanes.
| if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, ztmp, za); |
| cdot(ztmp, zn, zm, index, rot); |
| } |
| Mov(zd, ztmp); |
| } else { |
| MovprfxHelperScope guard(this, zd, za); |
| cdot(zd, zn, zm, index, rot); |
| } |
| } |
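// Illustrative sketch only (register numbers and the "ztmp" scratch are
// hypothetical): when zd aliases a multiplicand but not the accumulator,
//   Cdot(z0.VnS(), z1.VnS(), z0.VnB(), z2.VnB(), 0, 90);
// is expected to expand roughly to
//   movprfx ztmp, z1
//   cdot    ztmp.s, z0.b, z2.b[0], #90
//   mov     z0.d, ztmp.d
// Otherwise a single movprfx (or mov) of za into zd followed by one cdot is
// enough.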
| |
| void MacroAssembler::Cdot(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int rot) { |
  VIXL_ASSERT(allow_macro_instructions_);
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
| UseScratchRegisterScope temps(this); |
| VIXL_ASSERT(AreSameLaneSize(zn, zm)); |
| ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); |
| Mov(ztmp, zd.Aliases(zn) ? zn : zm); |
| MovprfxHelperScope guard(this, zd, za); |
| cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot); |
| } else { |
| MovprfxHelperScope guard(this, zd, za); |
| cdot(zd, zn, zm, rot); |
| } |
| } |
| |
| void MacroAssembler::FPMulAddHelper(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| SVEMulAddPredicatedZdaFn fn_zda, |
| SVEMulAddPredicatedZdnFn fn_zdn, |
| FPMacroNaNPropagationOption nan_option) { |
| ResolveFPNaNPropagationOption(&nan_option); |
| |
| if (zd.Aliases(za)) { |
| // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls. |
| SingleEmissionCheckScope guard(this); |
| (this->*fn_zda)(zd, pg, zn, zm); |
| } else if (zd.Aliases(zn)) { |
| // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb. |
| SingleEmissionCheckScope guard(this); |
| (this->*fn_zdn)(zd, pg, zm, za); |
| } else if (zd.Aliases(zm)) { |
| switch (nan_option) { |
| case FastNaNPropagation: { |
| // We treat multiplication as commutative in the fast mode, so we can |
| // swap zn and zm. |
| // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb. |
| SingleEmissionCheckScope guard(this); |
| (this->*fn_zdn)(zd, pg, zn, za); |
| return; |
| } |
| case StrictNaNPropagation: { |
| UseScratchRegisterScope temps(this); |
| // Use a scratch register to keep the argument order exactly as |
| // specified. |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn); |
| { |
| MovprfxHelperScope guard(this, scratch, pg, za); |
| // scratch = (-)za + ((-)zn * zm) |
| (this->*fn_zda)(scratch, pg, zn, zm); |
| } |
| Mov(zd, scratch); |
| return; |
| } |
| case NoFPMacroNaNPropagationSelected: |
| VIXL_UNREACHABLE(); |
| return; |
| } |
| } else { |
| // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls. |
| MovprfxHelperScope guard(this, zd, pg, za); |
| (this->*fn_zda)(zd, pg, zn, zm); |
| } |
| } |
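// Illustrative sketch of how the helper above selects an encoding; the
// register and predicate numbers are arbitrary, and S lanes are assumed:
//   Fmla(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS(), z2.VnS())  // zd == za
//     -> fmla z0.s, p0/m, z1.s, z2.s
//   Fmla(z0.VnS(), p0.Merging(), z3.VnS(), z0.VnS(), z2.VnS())  // zd == zn
//     -> fmad z0.s, p0/m, z2.s, z3.s
//   Fmla(z0.VnS(), p0.Merging(), z3.VnS(), z1.VnS(), z2.VnS())  // no aliasing
//     -> movprfx z0.s, p0/m, z3.s
//        fmla    z0.s, p0/m, z1.s, z2.s
// When zd aliases zm, FastNaNPropagation allows the multiplicands to be
// swapped so fmad can still be used; StrictNaNPropagation falls back to a
// scratch register instead.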
| |
| void MacroAssembler::Fmla(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPMulAddHelper(zd, |
| pg, |
| za, |
| zn, |
| zm, |
| &Assembler::fmla, |
| &Assembler::fmad, |
| nan_option); |
| } |
| |
| void MacroAssembler::Fmls(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPMulAddHelper(zd, |
| pg, |
| za, |
| zn, |
| zm, |
| &Assembler::fmls, |
| &Assembler::fmsb, |
| nan_option); |
| } |
| |
| void MacroAssembler::Fnmla(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPMulAddHelper(zd, |
| pg, |
| za, |
| zn, |
| zm, |
| &Assembler::fnmla, |
| &Assembler::fnmad, |
| nan_option); |
| } |
| |
| void MacroAssembler::Fnmls(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| FPMacroNaNPropagationOption nan_option) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| FPMulAddHelper(zd, |
| pg, |
| za, |
| zn, |
| zm, |
| &Assembler::fnmls, |
| &Assembler::fnmsb, |
| nan_option); |
| } |
| |
| void MacroAssembler::Ftmad(const ZRegister& zd, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int imm3) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (zd.Aliases(zm) && !zd.Aliases(zn)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); |
| Mov(scratch, zm); |
| MovprfxHelperScope guard(this, zd, zn); |
| ftmad(zd, zd, scratch, imm3); |
| } else { |
| MovprfxHelperScope guard(this, zd, zn); |
| ftmad(zd, zd, zm, imm3); |
| } |
| } |
| |
| void MacroAssembler::Fcadd(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int rot) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (zd.Aliases(zm) && !zd.Aliases(zn)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, pg, zn); |
| fcadd(scratch, pg, scratch, zm, rot); |
| } |
| Mov(zd, scratch); |
| } else { |
| MovprfxHelperScope guard(this, zd, pg, zn); |
| fcadd(zd, pg, zd, zm, rot); |
| } |
| } |
| |
| void MacroAssembler::Fcmla(const ZRegister& zd, |
| const PRegisterM& pg, |
| const ZRegister& za, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int rot) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, ztmp, za); |
| fcmla(ztmp, pg, zn, zm, rot); |
| } |
| Mov(zd, pg, ztmp); |
| } else { |
| MovprfxHelperScope guard(this, zd, pg, za); |
| fcmla(zd, pg, zn, zm, rot); |
| } |
| } |
| |
| void MacroAssembler::Splice(const ZRegister& zd, |
| const PRegister& pg, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
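  // SVE2 provides a constructive form of splice that reads a consecutive
  // register pair { zn, zn + 1 }, so when zn and zm are consecutive (and zd
  // does not alias zn) it can be emitted directly, with no movprfx or
  // scratch register.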
| if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) { |
| SingleEmissionCheckScope guard(this); |
| splice(zd, pg, zn, zm); |
| } else if (zd.Aliases(zm) && !zd.Aliases(zn)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, zn); |
| splice(scratch, pg, scratch, zm); |
| } |
| Mov(zd, scratch); |
| } else { |
| MovprfxHelperScope guard(this, zd, zn); |
| splice(zd, pg, zd, zm); |
| } |
| } |
| |
| void MacroAssembler::Clasta(const ZRegister& zd, |
| const PRegister& pg, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (zd.Aliases(zm) && !zd.Aliases(zn)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, zn); |
| clasta(scratch, pg, scratch, zm); |
| } |
| Mov(zd, scratch); |
| } else { |
| MovprfxHelperScope guard(this, zd, zn); |
| clasta(zd, pg, zd, zm); |
| } |
| } |
| |
| void MacroAssembler::Clastb(const ZRegister& zd, |
| const PRegister& pg, |
| const ZRegister& zn, |
| const ZRegister& zm) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (zd.Aliases(zm) && !zd.Aliases(zn)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd); |
| { |
| MovprfxHelperScope guard(this, scratch, zn); |
| clastb(scratch, pg, scratch, zm); |
| } |
| Mov(zd, scratch); |
| } else { |
| MovprfxHelperScope guard(this, zd, zn); |
| clastb(zd, pg, zd, zm); |
| } |
| } |
| |
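// Helper for the shift-right-and-accumulate macros (Srsra, Ssra, Ursra and
// Usra): if zd aliases zn but not za, zn is first copied to a scratch
// register so that the movprfx (or mov) of za into zd does not clobber it.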
| void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn, |
| const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| int shift) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (!zd.Aliases(za) && zd.Aliases(zn)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); |
| Mov(ztmp, zn); |
| { |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, ztmp, shift); |
| } |
| } else { |
| MovprfxHelperScope guard(this, zd, za); |
| (this->*fn)(zd, zn, shift); |
| } |
| } |
| |
| void MacroAssembler::Srsra(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| int shift) { |
| ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift); |
| } |
| |
| void MacroAssembler::Ssra(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| int shift) { |
| ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift); |
| } |
| |
| void MacroAssembler::Ursra(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| int shift) { |
| ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift); |
| } |
| |
| void MacroAssembler::Usra(const ZRegister& zd, |
| const ZRegister& za, |
| const ZRegister& zn, |
| int shift) { |
| ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift); |
| } |
| |
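// Helper for Cadd and Sqcadd: these are destructive on the first source, so
// if zd aliases zm but not zn, zm is copied to a scratch register before zn
// is moved into zd.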
| void MacroAssembler::ComplexAddition(ZZZImmFn fn, |
| const ZRegister& zd, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int rot) { |
| VIXL_ASSERT(allow_macro_instructions_); |
| if (!zd.Aliases(zn) && zd.Aliases(zm)) { |
| UseScratchRegisterScope temps(this); |
| ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm); |
| Mov(ztmp, zm); |
| { |
| MovprfxHelperScope guard(this, zd, zn); |
| (this->*fn)(zd, zd, ztmp, rot); |
| } |
| } else { |
| MovprfxHelperScope guard(this, zd, zn); |
| (this->*fn)(zd, zd, zm, rot); |
| } |
| } |
| |
| void MacroAssembler::Cadd(const ZRegister& zd, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int rot) { |
| ComplexAddition(&Assembler::cadd, zd, zn, zm, rot); |
| } |
| |
| void MacroAssembler::Sqcadd(const ZRegister& zd, |
| const ZRegister& zn, |
| const ZRegister& zm, |
| int rot) { |
| ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot); |
| } |
| |
| } // namespace aarch64 |
| } // namespace vixl |