| // Copyright 2015, VIXL authors |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // * Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // * Neither the name of ARM Limited nor the names of its contributors may be |
| // used to endorse or promote products derived from this software without |
| // specific prior written permission. |
| // |
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
| // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE |
| // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include <cfloat> |
| #include <cstdio> |
| #include <sstream> |
| |
| #include "test-runner.h" |
| #include "test-utils.h" |
| |
| #include "aarch64/cpu-features-auditor-aarch64.h" |
| #include "aarch64/macro-assembler-aarch64.h" |
| #include "aarch64/simulator-aarch64.h" |
| #include "aarch64/test-simulator-inputs-aarch64.h" |
| #include "aarch64/test-simulator-traces-aarch64.h" |
| #include "aarch64/test-utils-aarch64.h" |
| |
| namespace vixl { |
| namespace aarch64 { |
| |
| // ==== Simulator Tests ==== |
| // |
| // These simulator tests check instruction behaviour against a trace taken from |
| // real AArch64 hardware. The same test code is used to generate the trace; the |
| // results are printed to stdout when the test is run with |
| // --generate_test_trace. |
| // |
| // The input lists and expected results are stored in test/traces. The expected |
| // results can be regenerated using tools/generate_simulator_traces.py. Adding a |
| // test for a new instruction is described at the top of |
| // test-simulator-traces-aarch64.h. |
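//
// For example, the expected workflow is (an illustrative sketch; the runner
// binary name and exact flags here are assumptions based on the description
// above):
//
//   $ ./test-runner AARCH64_SIM_<test> --generate_test_trace
//   $ tools/generate_simulator_traces.py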
| |
| #define __ masm. |
| #define TEST(name) TEST_(AARCH64_SIM_##name) |
| |
| #define SETUP() SETUP_WITH_FEATURES(CPUFeatures()) |
| |
| #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 |
| |
| #define SETUP_WITH_FEATURES(...) \ |
| MacroAssembler masm; \ |
| masm.SetCPUFeatures(CPUFeatures(__VA_ARGS__)); \ |
| Decoder decoder; \ |
| Simulator simulator(&decoder); \ |
| simulator.SetColouredTrace(Test::coloured_trace()); |
| |
| #define START() \ |
| masm.Reset(); \ |
| simulator.ResetState(); \ |
| __ PushCalleeSavedRegisters(); \ |
  /* The infrastructure code (e.g. the prologue and epilogue) isn't */       \
  /* covered by the trace yet, so suppress tag mismatch exceptions */        \
  /* before this point. */                                                   \
| if (masm.GetCPUFeatures()->Has(CPUFeatures::kMTE)) { \ |
| __ Hlt(DebugHltOpcode::kMTEActive); \ |
| } \ |
| if (Test::trace_reg()) { \ |
| __ Trace(LOG_STATE, TRACE_ENABLE); \ |
| } \ |
| if (Test::trace_write()) { \ |
| __ Trace(LOG_WRITE, TRACE_ENABLE); \ |
| } \ |
| if (Test::trace_sim()) { \ |
| __ Trace(LOG_DISASM, TRACE_ENABLE); \ |
| } |
| |
| #define END() \ |
| if (masm.GetCPUFeatures()->Has(CPUFeatures::kMTE)) { \ |
| __ Hlt(DebugHltOpcode::kMTEInactive); \ |
| } \ |
| __ Trace(LOG_ALL, TRACE_DISABLE); \ |
| __ PopCalleeSavedRegisters(); \ |
| __ Ret(); \ |
| masm.FinalizeCode() |
| |
| #define TRY_RUN(skipped) \ |
| DISASSEMBLE(); \ |
| simulator.RunFrom(masm.GetBuffer()->GetStartAddress<Instruction*>()); \ |
| /* The simulator can run every test. */ \ |
| *skipped = false |
| |
| #ifdef VIXL_ENABLE_IMPLICIT_CHECKS |
| // The signal handler needs access to the simulator. |
| Simulator* gImplicitCheckSim; |
| |
| #ifdef __x86_64__ |
| #include <signal.h> |
| #include <ucontext.h> |
| void HandleSegFault(int sig, siginfo_t* info, void* context) { |
| USE(sig); |
| USE(info); |
| Simulator* sim = gImplicitCheckSim; |
| |
| // Did the signal come from the simulator? |
| ucontext_t* uc = reinterpret_cast<ucontext_t*>(context); |
| uintptr_t fault_pc = uc->uc_mcontext.gregs[REG_RIP]; |
| VIXL_CHECK(sim->IsSimulatedMemoryAccess(fault_pc)); |
| |
| // Increment the counter (x1) each time we handle a signal. |
  int64_t counter = sim->ReadXRegister(1);
| sim->WriteXRegister(1, ++counter); |
| |
  // Return to the VIXL memory access continuation point, which is also the
  // next instruction after this handler.
| uc->uc_mcontext.gregs[REG_RIP] = sim->GetSignalReturnAddress(); |
| // Return that the memory access failed. |
| uc->uc_mcontext.gregs[REG_RAX] = |
| static_cast<greg_t>(MemoryAccessResult::Failure); |
| } |
| #endif // __x86_64__ |
| |
// Start an implicit check test with a counter and a start label so that the
// number of faults can be counted. Note: every instruction emitted after the
// start label is expected to fault.
| #define START_IMPLICIT_CHECK() \ |
| gImplicitCheckSim = &simulator; \ |
| /* Set up a signal handler to count the number of faulting instructions. */ \ |
  struct sigaction sa;                                                       \
  sigemptyset(&sa.sa_mask);                                                  \
  /* SA_SIGINFO selects the three-argument handler form used above. */      \
  sa.sa_flags = SA_SIGINFO;                                                  \
  sa.sa_sigaction = HandleSegFault;                                          \
  sigaction(SIGSEGV, &sa, NULL);                                             \
| START(); \ |
| /* Reset the counter. */ \ |
| __ Mov(x1, 0); \ |
| /* Use a consistent bad address. */ \ |
| __ Mov(x15, xzr); \ |
| __ Mov(ip0, xzr); \ |
  /* Set the amount of data to load. */                                      \
| __ Mov(ip1, 4096); \ |
| [[maybe_unused]] MemOperand bad_memory = MemOperand(ip0); \ |
| if (masm.GetCPUFeatures()->Has(CPUFeatures::kSVE)) { \ |
| /* Turn on all lanes to ensure all loads/stores are tested. */ \ |
| __ Ptrue(p0.VnB()); \ |
| __ Ptrue(p1.VnB()); \ |
| __ Ptrue(p2.VnB()); \ |
| __ Ptrue(p3.VnB()); \ |
| __ Ptrue(p4.VnB()); \ |
| __ Ptrue(p5.VnB()); \ |
| __ Ptrue(p6.VnB()); \ |
| __ Ptrue(p7.VnB()); \ |
| __ Ptrue(p8.VnB()); \ |
| __ Ptrue(p9.VnB()); \ |
| __ Ptrue(p10.VnB()); \ |
| __ Ptrue(p11.VnB()); \ |
| __ Ptrue(p12.VnB()); \ |
| __ Ptrue(p13.VnB()); \ |
| __ Ptrue(p14.VnB()); \ |
| __ Ptrue(p15.VnB()); \ |
| } \ |
| Label l_start, l_end; \ |
| __ Bind(&l_start); |
| |
| #define END_IMPLICIT_CHECK() \ |
| __ Bind(&l_end); \ |
| /* Return the counter. */ \ |
| __ Mov(x0, x1); \ |
| END(); |
| |
| #define TRY_RUN_IMPLICIT_CHECK() \ |
| bool skipped; \ |
| TRY_RUN(&skipped); \ |
| /* Implicit checks should only be used with the simulator. */ \ |
| VIXL_ASSERT(!skipped); \ |
| /* Check that each load/store instruction generated a segfault that was */ \ |
| /* raised and dealt with. */ \ |
| size_t result = simulator.ReadXRegister(0); \ |
| size_t num_of_faulting_instr = masm.GetSizeOfCodeGeneratedSince(&l_start) - \ |
| masm.GetSizeOfCodeGeneratedSince(&l_end); \ |
| VIXL_CHECK((result * kInstructionSize) == num_of_faulting_instr); |
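
// A test built from these macros is expected to follow this shape (an
// illustrative sketch, not an actual test in this file):
//
//   TEST(implicit_check_ldr) {
//     SETUP();
//     START_IMPLICIT_CHECK();
//     __ Ldr(x0, bad_memory);  // Each access faults and bumps the counter.
//     END_IMPLICIT_CHECK();
//     TRY_RUN_IMPLICIT_CHECK();
//   }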
| |
| #endif // VIXL_ENABLE_IMPLICIT_CHECKS |
| |
| #else // VIXL_INCLUDE_SIMULATOR_AARCH64 |
| |
| #define SETUP_WITH_FEATURES(...) \ |
| MacroAssembler masm; \ |
| masm.SetCPUFeatures(CPUFeatures(__VA_ARGS__)); \ |
| CPU::SetUp() |
| |
| #define START() \ |
| masm.Reset(); \ |
| __ PushCalleeSavedRegisters() |
| |
| #define END() \ |
| __ PopCalleeSavedRegisters(); \ |
| __ Ret(); \ |
| masm.FinalizeCode() |
| |
| #define TRY_RUN(skipped) \ |
| DISASSEMBLE(); \ |
| /* If the test uses features that the current CPU doesn't support, don't */ \ |
| /* attempt to run it natively. */ \ |
| { \ |
| Decoder decoder; \ |
| /* TODO: Once available, use runtime feature detection. The use of */ \ |
| /* AArch64LegacyBaseline is a stopgap. */ \ |
| const CPUFeatures& this_machine = CPUFeatures::AArch64LegacyBaseline(); \ |
| CPUFeaturesAuditor auditor(&decoder, this_machine); \ |
| CodeBuffer* buffer = masm.GetBuffer(); \ |
| decoder.Decode(buffer->GetStartAddress<Instruction*>(), \ |
| buffer->GetEndAddress<Instruction*>()); \ |
| const CPUFeatures& requirements = auditor.GetSeenFeatures(); \ |
| if (this_machine.Has(requirements)) { \ |
| masm.GetBuffer()->SetExecutable(); \ |
| ExecuteMemory(buffer->GetStartAddress<byte*>(), \ |
| masm.GetSizeOfCodeGenerated()); \ |
| masm.GetBuffer()->SetWritable(); \ |
| *skipped = false; \ |
| } else { \ |
| std::stringstream os; \ |
| /* Note: This message needs to match REGEXP_MISSING_FEATURES from */ \ |
| /* tools/threaded_test.py. */ \ |
| os << "SKIPPED: Missing features: { "; \ |
| os << requirements.Without(this_machine) << " }\n"; \ |
| printf("%s", os.str().c_str()); \ |
| *skipped = true; \ |
| } \ |
| } |
| |
| |
| #endif // VIXL_INCLUDE_SIMULATOR_AARCH64 |
| |
| |
| #define DISASSEMBLE() \ |
| if (Test::disassemble()) { \ |
| PrintDisassembler disasm(stdout); \ |
| CodeBuffer* buffer = masm.GetBuffer(); \ |
| Instruction* start = buffer->GetStartAddress<Instruction*>(); \ |
| Instruction* end = buffer->GetEndAddress<Instruction*>(); \ |
| disasm.DisassembleBuffer(start, end); \ |
| } |
| |
| // The maximum number of errors to report in detail for each test. |
| static const unsigned kErrorReportLimit = 8; |
| |
| |
| // Overloaded versions of RawbitsToDouble and RawbitsToFloat for use in the |
| // templated test functions. |
| static float rawbits_to_fp(uint32_t bits) { return RawbitsToFloat(bits); } |
| |
| static double rawbits_to_fp(uint64_t bits) { return RawbitsToDouble(bits); } |
| |
// The rawbits_to_fp functions are only used for printing decimal values, so
// approximating FP16 as a double is sufficient.
| static double rawbits_to_fp(uint16_t bits) { |
| return FPToDouble(RawbitsToFloat16(bits), kIgnoreDefaultNaN); |
| } |
| |
| |
| // MacroAssembler member function pointers to pass to the test dispatchers. |
| typedef void (MacroAssembler::*Test1OpFPHelper_t)(const VRegister& fd, |
| const VRegister& fn); |
| typedef void (MacroAssembler::*Test2OpFPHelper_t)(const VRegister& fd, |
| const VRegister& fn, |
| const VRegister& fm); |
| typedef void (MacroAssembler::*Test3OpFPHelper_t)(const VRegister& fd, |
| const VRegister& fn, |
| const VRegister& fm, |
| const VRegister& fa); |
| typedef void (MacroAssembler::*TestFPCmpHelper_t)(const VRegister& fn, |
| const VRegister& fm); |
| typedef void (MacroAssembler::*TestFPCmpZeroHelper_t)(const VRegister& fn, |
| double value); |
| typedef void (MacroAssembler::*TestFPToIntHelper_t)(const Register& rd, |
| const VRegister& fn); |
| typedef void (MacroAssembler::*TestFPToFixedHelper_t)(const Register& rd, |
| const VRegister& fn, |
| int fbits); |
| typedef void (MacroAssembler::*TestFixedToFPHelper_t)(const VRegister& fd, |
| const Register& rn, |
| int fbits); |
| // TODO: 'Test2OpNEONHelper_t' and 'Test2OpFPHelper_t' can be |
| // consolidated into one routine. |
| typedef void (MacroAssembler::*Test1OpNEONHelper_t)(const VRegister& vd, |
| const VRegister& vn); |
| typedef void (MacroAssembler::*Test2OpNEONHelper_t)(const VRegister& vd, |
| const VRegister& vn, |
| const VRegister& vm); |
| typedef void (MacroAssembler::*TestByElementNEONHelper_t)(const VRegister& vd, |
| const VRegister& vn, |
| const VRegister& vm, |
| int vm_index); |
| typedef void (MacroAssembler::*TestOpImmOpImmVdUpdateNEONHelper_t)( |
| const VRegister& vd, int imm1, const VRegister& vn, int imm2); |
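
// For example (illustrative of the MacroAssembler signatures these typedefs
// are meant to match):
//
//   Test1OpFPHelper_t one_op = &MacroAssembler::Fabs;    // fd, fn
//   Test2OpFPHelper_t two_op = &MacroAssembler::Fadd;    // fd, fn, fm
//   Test3OpFPHelper_t three_op = &MacroAssembler::Fmadd; // fd, fn, fm, fa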
| |
// This makes it possible to use the same type name for both the function
// pointer and the array of immediates passed to the helper routines.
| template <typename T> |
| class Test2OpImmediateNEONHelper_t { |
| public: |
| typedef void (MacroAssembler::*mnemonic)(const VRegister& vd, |
| const VRegister& vn, |
| T imm); |
| }; |
| |
| |
| // Maximum number of hex characters required to represent values of either |
| // templated type. |
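// For example, with Ta = uint16_t and Tb = uint64_t, this returns
// (8 * 8) / 4 = 16 characters.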
| template <typename Ta, typename Tb> |
| static unsigned MaxHexCharCount() { |
| unsigned count = static_cast<unsigned>(std::max(sizeof(Ta), sizeof(Tb))); |
| return (count * 8) / 4; |
| } |
| |
| |
| // Standard test dispatchers. |
| |
| |
| static void Test1Op_Helper(Test1OpFPHelper_t helper, |
| uintptr_t inputs, |
| unsigned inputs_length, |
| uintptr_t results, |
| unsigned d_size, |
| unsigned n_size, |
| bool* skipped) { |
| VIXL_ASSERT((d_size == kDRegSize) || (d_size == kSRegSize) || |
| (d_size == kHRegSize)); |
| VIXL_ASSERT((n_size == kDRegSize) || (n_size == kSRegSize) || |
| (n_size == kHRegSize)); |
| |
| CPUFeatures features; |
| features.Combine(CPUFeatures::kFP, CPUFeatures::kFPHalf); |
| // For frint{32,64}{x,y} variants. |
| features.Combine(CPUFeatures::kFrintToFixedSizedInt); |
| SETUP_WITH_FEATURES(features); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n; |
| |
| Register out = x0; |
| Register inputs_base = x1; |
| Register length = w2; |
| Register index_n = w3; |
| |
| int n_index_shift; |
| VRegister fd; |
| VRegister fn; |
| if (n_size == kDRegSize) { |
| n_index_shift = kDRegSizeInBytesLog2; |
| fn = d1; |
| } else if (n_size == kSRegSize) { |
| n_index_shift = kSRegSizeInBytesLog2; |
| fn = s1; |
| } else { |
| n_index_shift = kHRegSizeInBytesLog2; |
| fn = h1; |
| } |
| |
| if (d_size == kDRegSize) { |
| fd = d0; |
| } else if (d_size == kSRegSize) { |
| fd = s0; |
| } else { |
| fd = h0; |
| } |
| __ Mov(out, results); |
| __ Mov(inputs_base, inputs); |
| __ Mov(length, inputs_length); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| __ Ldr(fn, MemOperand(inputs_base, index_n, UXTW, n_index_shift)); |
| |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(fd, fn); |
| } |
| __ Str(fd, MemOperand(out, fd.GetSizeInBytes(), PostIndex)); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test FP instructions. The inputs[] and expected[] arrays should be arrays of |
| // rawbits representations of doubles or floats. This ensures that exact bit |
| // comparisons can be performed. |
| template <typename Tn, typename Td> |
| static void Test1Op(const char* name, |
| Test1OpFPHelper_t helper, |
| const Tn inputs[], |
| unsigned inputs_length, |
| const Td expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned results_length = inputs_length; |
| Td* results = new Td[results_length]; |
| |
| const unsigned d_bits = sizeof(Td) * 8; |
| const unsigned n_bits = sizeof(Tn) * 8; |
| bool skipped; |
| |
| Test1Op_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| d_bits, |
| n_bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_%s[] = {\n", d_bits, name); |
| for (unsigned d = 0; d < results_length; d++) { |
| printf(" 0x%0*" PRIx64 ",\n", |
| d_bits / 4, |
| static_cast<uint64_t>(results[d])); |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 " (%s %g):\n", |
| name, |
| n_bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| name, |
| rawbits_to_fp(inputs[n])); |
| printf(" Expected: 0x%0*" PRIx64 " (%g)\n", |
| d_bits / 4, |
| static_cast<uint64_t>(expected[d]), |
| rawbits_to_fp(expected[d])); |
| printf(" Found: 0x%0*" PRIx64 " (%g)\n", |
| d_bits / 4, |
| static_cast<uint64_t>(results[d]), |
| rawbits_to_fp(results[d])); |
| printf("\n"); |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| static void Test2Op_Helper(Test2OpFPHelper_t helper, |
| uintptr_t inputs, |
| unsigned inputs_length, |
| uintptr_t results, |
| unsigned reg_size, |
| bool* skipped) { |
| VIXL_ASSERT((reg_size == kDRegSize) || (reg_size == kSRegSize) || |
| (reg_size == kHRegSize)); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kFP, CPUFeatures::kFPHalf); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n, loop_m; |
| |
| Register out = x0; |
| Register inputs_base = x1; |
| Register length = w2; |
| Register index_n = w3; |
| Register index_m = w4; |
| |
| bool double_op = reg_size == kDRegSize; |
| bool float_op = reg_size == kSRegSize; |
| int index_shift; |
| if (double_op) { |
| index_shift = kDRegSizeInBytesLog2; |
| } else if (float_op) { |
| index_shift = kSRegSizeInBytesLog2; |
| } else { |
| index_shift = kHRegSizeInBytesLog2; |
| } |
| |
| VRegister fd; |
| VRegister fn; |
| VRegister fm; |
| |
| if (double_op) { |
| fd = d0; |
| fn = d1; |
| fm = d2; |
| } else if (float_op) { |
| fd = s0; |
| fn = s1; |
| fm = s2; |
| } else { |
| fd = h0; |
| fn = h1; |
| fm = h2; |
| } |
| |
| __ Mov(out, results); |
| __ Mov(inputs_base, inputs); |
| __ Mov(length, inputs_length); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| __ Ldr(fn, MemOperand(inputs_base, index_n, UXTW, index_shift)); |
| |
| __ Mov(index_m, 0); |
| __ Bind(&loop_m); |
| __ Ldr(fm, MemOperand(inputs_base, index_m, UXTW, index_shift)); |
| |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(fd, fn, fm); |
| } |
| __ Str(fd, MemOperand(out, fd.GetSizeInBytes(), PostIndex)); |
| |
| __ Add(index_m, index_m, 1); |
| __ Cmp(index_m, inputs_length); |
| __ B(lo, &loop_m); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test FP instructions. The inputs[] and expected[] arrays should be arrays of |
| // rawbits representations of doubles or floats. This ensures that exact bit |
| // comparisons can be performed. |
| template <typename T> |
| static void Test2Op(const char* name, |
| Test2OpFPHelper_t helper, |
| const T inputs[], |
| unsigned inputs_length, |
| const T expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned results_length = inputs_length * inputs_length; |
| T* results = new T[results_length]; |
| |
| const unsigned bits = sizeof(T) * 8; |
| bool skipped; |
| |
| Test2Op_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_%s[] = {\n", bits, name); |
| for (unsigned d = 0; d < results_length; d++) { |
| printf(" 0x%0*" PRIx64 ",\n", |
| bits / 4, |
| static_cast<uint64_t>(results[d])); |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++) { |
| for (unsigned m = 0; m < inputs_length; m++, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 ", 0x%0*" PRIx64 " (%s %g %g):\n", |
| name, |
| bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| bits / 4, |
| static_cast<uint64_t>(inputs[m]), |
| name, |
| rawbits_to_fp(inputs[n]), |
| rawbits_to_fp(inputs[m])); |
| printf(" Expected: 0x%0*" PRIx64 " (%g)\n", |
| bits / 4, |
| static_cast<uint64_t>(expected[d]), |
| rawbits_to_fp(expected[d])); |
| printf(" Found: 0x%0*" PRIx64 " (%g)\n", |
| bits / 4, |
| static_cast<uint64_t>(results[d]), |
| rawbits_to_fp(results[d])); |
| printf("\n"); |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| static void Test3Op_Helper(Test3OpFPHelper_t helper, |
| uintptr_t inputs, |
| unsigned inputs_length, |
| uintptr_t results, |
| unsigned reg_size, |
| bool* skipped) { |
| VIXL_ASSERT((reg_size == kDRegSize) || (reg_size == kSRegSize) || |
| (reg_size == kHRegSize)); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kFP, CPUFeatures::kFPHalf); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n, loop_m, loop_a; |
| |
| Register out = x0; |
| Register inputs_base = x1; |
| Register length = w2; |
| Register index_n = w3; |
| Register index_m = w4; |
| Register index_a = w5; |
| |
| bool double_op = reg_size == kDRegSize; |
| bool single_op = reg_size == kSRegSize; |
| int index_shift; |
| VRegister fd(0, reg_size); |
| VRegister fn(1, reg_size); |
| VRegister fm(2, reg_size); |
| VRegister fa(3, reg_size); |
| if (double_op) { |
| index_shift = kDRegSizeInBytesLog2; |
| } else if (single_op) { |
| index_shift = kSRegSizeInBytesLog2; |
| } else { |
| index_shift = kHRegSizeInBytesLog2; |
| } |
| |
| __ Mov(out, results); |
| __ Mov(inputs_base, inputs); |
| __ Mov(length, inputs_length); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| __ Ldr(fn, MemOperand(inputs_base, index_n, UXTW, index_shift)); |
| |
| __ Mov(index_m, 0); |
| __ Bind(&loop_m); |
| __ Ldr(fm, MemOperand(inputs_base, index_m, UXTW, index_shift)); |
| |
| __ Mov(index_a, 0); |
| __ Bind(&loop_a); |
| __ Ldr(fa, MemOperand(inputs_base, index_a, UXTW, index_shift)); |
| |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(fd, fn, fm, fa); |
| } |
| __ Str(fd, MemOperand(out, fd.GetSizeInBytes(), PostIndex)); |
| |
| __ Add(index_a, index_a, 1); |
| __ Cmp(index_a, inputs_length); |
| __ B(lo, &loop_a); |
| |
| __ Add(index_m, index_m, 1); |
| __ Cmp(index_m, inputs_length); |
| __ B(lo, &loop_m); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test FP instructions. The inputs[] and expected[] arrays should be arrays of |
| // rawbits representations of doubles or floats. This ensures that exact bit |
| // comparisons can be performed. |
| template <typename T> |
| static void Test3Op(const char* name, |
| Test3OpFPHelper_t helper, |
| const T inputs[], |
| unsigned inputs_length, |
| const T expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
  const unsigned results_length =
      inputs_length * inputs_length * inputs_length;
| T* results = new T[results_length]; |
| |
| const unsigned bits = sizeof(T) * 8; |
| bool skipped; |
| |
| Test3Op_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_%s[] = {\n", bits, name); |
| for (unsigned d = 0; d < results_length; d++) { |
| printf(" 0x%0*" PRIx64 ",\n", |
| bits / 4, |
| static_cast<uint64_t>(results[d])); |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++) { |
| for (unsigned m = 0; m < inputs_length; m++) { |
| for (unsigned a = 0; a < inputs_length; a++, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 ", 0x%0*" PRIx64 ", 0x%0*" PRIx64 |
| " (%s %g %g %g):\n", |
| name, |
| bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| bits / 4, |
| static_cast<uint64_t>(inputs[m]), |
| bits / 4, |
| static_cast<uint64_t>(inputs[a]), |
| name, |
| rawbits_to_fp(inputs[n]), |
| rawbits_to_fp(inputs[m]), |
| rawbits_to_fp(inputs[a])); |
| printf(" Expected: 0x%0*" PRIx64 " (%g)\n", |
| bits / 4, |
| static_cast<uint64_t>(expected[d]), |
| rawbits_to_fp(expected[d])); |
| printf(" Found: 0x%0*" PRIx64 " (%g)\n", |
| bits / 4, |
| static_cast<uint64_t>(results[d]), |
| rawbits_to_fp(results[d])); |
| printf("\n"); |
| } |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| static void TestCmp_Helper(TestFPCmpHelper_t helper, |
| uintptr_t inputs, |
| unsigned inputs_length, |
| uintptr_t results, |
| unsigned reg_size, |
| bool* skipped) { |
| VIXL_ASSERT((reg_size == kDRegSize) || (reg_size == kSRegSize)); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kFP); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n, loop_m; |
| |
| Register out = x0; |
| Register inputs_base = x1; |
| Register length = w2; |
| Register index_n = w3; |
| Register index_m = w4; |
| Register flags = x5; |
| |
| bool double_op = reg_size == kDRegSize; |
| const int index_shift = |
| double_op ? kDRegSizeInBytesLog2 : kSRegSizeInBytesLog2; |
| |
| VRegister fn = double_op ? d1 : s1; |
| VRegister fm = double_op ? d2 : s2; |
| |
| __ Mov(out, results); |
| __ Mov(inputs_base, inputs); |
| __ Mov(length, inputs_length); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| __ Ldr(fn, MemOperand(inputs_base, index_n, UXTW, index_shift)); |
| |
| __ Mov(index_m, 0); |
| __ Bind(&loop_m); |
| __ Ldr(fm, MemOperand(inputs_base, index_m, UXTW, index_shift)); |
| |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(fn, fm); |
| } |
| __ Mrs(flags, NZCV); |
| __ Ubfx(flags, flags, 28, 4); |
| __ Strb(flags, MemOperand(out, 1, PostIndex)); |
| |
| __ Add(index_m, index_m, 1); |
| __ Cmp(index_m, inputs_length); |
| __ B(lo, &loop_m); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test FP instructions. The inputs[] and expected[] arrays should be arrays of |
| // rawbits representations of doubles or floats. This ensures that exact bit |
| // comparisons can be performed. |
| template <typename T> |
| static void TestCmp(const char* name, |
| TestFPCmpHelper_t helper, |
| const T inputs[], |
| unsigned inputs_length, |
| const uint8_t expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned results_length = inputs_length * inputs_length; |
| uint8_t* results = new uint8_t[results_length]; |
| |
| const unsigned bits = sizeof(T) * 8; |
| bool skipped; |
| |
| TestCmp_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint8_t kExpected_%s[] = {\n", name); |
| for (unsigned d = 0; d < results_length; d++) { |
| // Each NZCV result only requires 4 bits. |
| VIXL_ASSERT((results[d] & 0xf) == results[d]); |
| printf(" 0x%" PRIx8 ",\n", results[d]); |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++) { |
| for (unsigned m = 0; m < inputs_length; m++, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 ", 0x%0*" PRIx64 " (%s %g %g):\n", |
| name, |
| bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| bits / 4, |
| static_cast<uint64_t>(inputs[m]), |
| name, |
| rawbits_to_fp(inputs[n]), |
| rawbits_to_fp(inputs[m])); |
| printf(" Expected: %c%c%c%c (0x%" PRIx8 ")\n", |
| (expected[d] & 0x8) ? 'N' : 'n', |
| (expected[d] & 0x4) ? 'Z' : 'z', |
| (expected[d] & 0x2) ? 'C' : 'c', |
| (expected[d] & 0x1) ? 'V' : 'v', |
| expected[d]); |
| printf(" Found: %c%c%c%c (0x%" PRIx8 ")\n", |
| (results[d] & 0x8) ? 'N' : 'n', |
| (results[d] & 0x4) ? 'Z' : 'z', |
| (results[d] & 0x2) ? 'C' : 'c', |
| (results[d] & 0x1) ? 'V' : 'v', |
| results[d]); |
| printf("\n"); |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| static void TestCmpZero_Helper(TestFPCmpZeroHelper_t helper, |
| uintptr_t inputs, |
| unsigned inputs_length, |
| uintptr_t results, |
| unsigned reg_size, |
| bool* skipped) { |
| VIXL_ASSERT((reg_size == kDRegSize) || (reg_size == kSRegSize)); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kFP); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
  Label loop_n;
| |
| Register out = x0; |
| Register inputs_base = x1; |
| Register length = w2; |
| Register index_n = w3; |
| Register flags = x4; |
| |
| bool double_op = reg_size == kDRegSize; |
| const int index_shift = |
| double_op ? kDRegSizeInBytesLog2 : kSRegSizeInBytesLog2; |
| |
| VRegister fn = double_op ? d1 : s1; |
| |
| __ Mov(out, results); |
| __ Mov(inputs_base, inputs); |
| __ Mov(length, inputs_length); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| __ Ldr(fn, MemOperand(inputs_base, index_n, UXTW, index_shift)); |
| |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(fn, 0.0); |
| } |
| __ Mrs(flags, NZCV); |
| __ Ubfx(flags, flags, 28, 4); |
| __ Strb(flags, MemOperand(out, 1, PostIndex)); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test FP instructions. The inputs[] and expected[] arrays should be arrays of |
| // rawbits representations of doubles or floats. This ensures that exact bit |
| // comparisons can be performed. |
| template <typename T> |
| static void TestCmpZero(const char* name, |
| TestFPCmpZeroHelper_t helper, |
| const T inputs[], |
| unsigned inputs_length, |
| const uint8_t expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned results_length = inputs_length; |
| uint8_t* results = new uint8_t[results_length]; |
| |
| const unsigned bits = sizeof(T) * 8; |
| bool skipped; |
| |
| TestCmpZero_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint8_t kExpected_%s[] = {\n", name); |
| for (unsigned d = 0; d < results_length; d++) { |
| // Each NZCV result only requires 4 bits. |
| VIXL_ASSERT((results[d] & 0xf) == results[d]); |
| printf(" 0x%" PRIx8 ",\n", results[d]); |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 ", 0x%0*u (%s %g #0.0):\n", |
| name, |
| bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| bits / 4, |
| 0, |
| name, |
| rawbits_to_fp(inputs[n])); |
| printf(" Expected: %c%c%c%c (0x%" PRIx8 ")\n", |
| (expected[d] & 0x8) ? 'N' : 'n', |
| (expected[d] & 0x4) ? 'Z' : 'z', |
| (expected[d] & 0x2) ? 'C' : 'c', |
| (expected[d] & 0x1) ? 'V' : 'v', |
| expected[d]); |
| printf(" Found: %c%c%c%c (0x%" PRIx8 ")\n", |
| (results[d] & 0x8) ? 'N' : 'n', |
| (results[d] & 0x4) ? 'Z' : 'z', |
| (results[d] & 0x2) ? 'C' : 'c', |
| (results[d] & 0x1) ? 'V' : 'v', |
| results[d]); |
| printf("\n"); |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| static void TestFPToFixed_Helper(TestFPToFixedHelper_t helper, |
| uintptr_t inputs, |
| unsigned inputs_length, |
| uintptr_t results, |
| unsigned d_size, |
| unsigned n_size, |
| bool* skipped) { |
| VIXL_ASSERT((d_size == kXRegSize) || (d_size == kWRegSize)); |
| VIXL_ASSERT((n_size == kDRegSize) || (n_size == kSRegSize) || |
| (n_size == kHRegSize)); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kFP, CPUFeatures::kFPHalf); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n; |
| |
| Register out = x0; |
| Register inputs_base = x1; |
| Register length = w2; |
| Register index_n = w3; |
| |
| int n_index_shift; |
| if (n_size == kDRegSize) { |
| n_index_shift = kDRegSizeInBytesLog2; |
| } else if (n_size == kSRegSize) { |
| n_index_shift = kSRegSizeInBytesLog2; |
| } else { |
| n_index_shift = kHRegSizeInBytesLog2; |
| } |
| |
| Register rd = (d_size == kXRegSize) ? Register(x10) : Register(w10); |
| VRegister fn; |
| if (n_size == kDRegSize) { |
| fn = d1; |
| } else if (n_size == kSRegSize) { |
| fn = s1; |
| } else { |
| fn = h1; |
| } |
| |
| __ Mov(out, results); |
| __ Mov(inputs_base, inputs); |
| __ Mov(length, inputs_length); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| __ Ldr(fn, MemOperand(inputs_base, index_n, UXTW, n_index_shift)); |
| |
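  // Test every fbits value from 0 up to the destination register size,
  // inclusive.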
| for (unsigned fbits = 0; fbits <= d_size; ++fbits) { |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(rd, fn, fbits); |
| } |
| __ Str(rd, MemOperand(out, rd.GetSizeInBytes(), PostIndex)); |
| } |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| static void TestFPToInt_Helper(TestFPToIntHelper_t helper, |
| uintptr_t inputs, |
| unsigned inputs_length, |
| uintptr_t results, |
| unsigned d_size, |
| unsigned n_size, |
| bool* skipped) { |
| VIXL_ASSERT((d_size == kXRegSize) || (d_size == kWRegSize)); |
| VIXL_ASSERT((n_size == kDRegSize) || (n_size == kSRegSize) || |
| (n_size == kHRegSize)); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kFP, |
| CPUFeatures::kFPHalf, |
| CPUFeatures::kJSCVT); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n; |
| |
| Register out = x0; |
| Register inputs_base = x1; |
| Register length = w2; |
| Register index_n = w3; |
| |
| int n_index_shift; |
| if (n_size == kDRegSize) { |
| n_index_shift = kDRegSizeInBytesLog2; |
| } else if (n_size == kSRegSize) { |
| n_index_shift = kSRegSizeInBytesLog2; |
| } else { |
| n_index_shift = kHRegSizeInBytesLog2; |
| } |
| |
| Register rd = (d_size == kXRegSize) ? Register(x10) : Register(w10); |
| VRegister fn; |
| if (n_size == kDRegSize) { |
| fn = d1; |
| } else if (n_size == kSRegSize) { |
| fn = s1; |
| } else { |
| fn = h1; |
| } |
| |
| __ Mov(out, results); |
| __ Mov(inputs_base, inputs); |
| __ Mov(length, inputs_length); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| __ Ldr(fn, MemOperand(inputs_base, index_n, UXTW, n_index_shift)); |
| |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(rd, fn); |
| } |
| __ Str(rd, MemOperand(out, rd.GetSizeInBytes(), PostIndex)); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test FP instructions. |
| // - The inputs[] array should be an array of rawbits representations of |
| // doubles or floats. This ensures that exact bit comparisons can be |
| // performed. |
| // - The expected[] array should be an array of signed integers. |
| template <typename Tn, typename Td> |
| static void TestFPToS(const char* name, |
| TestFPToIntHelper_t helper, |
| const Tn inputs[], |
| unsigned inputs_length, |
| const Td expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned results_length = inputs_length; |
| Td* results = new Td[results_length]; |
| |
| const unsigned d_bits = sizeof(Td) * 8; |
| const unsigned n_bits = sizeof(Tn) * 8; |
| bool skipped; |
| |
| TestFPToInt_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| d_bits, |
| n_bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const int%u_t kExpected_%s[] = {\n", d_bits, name); |
| // There is no simple C++ literal for INT*_MIN that doesn't produce |
| // warnings, so we use an appropriate constant in that case instead. |
| // Deriving int_d_min in this way (rather than just checking INT64_MIN and |
| // the like) avoids warnings about comparing values with differing ranges. |
| const int64_t int_d_max = (UINT64_C(1) << (d_bits - 1)) - 1; |
| const int64_t int_d_min = -(int_d_max)-1; |
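    // For example, for d_bits == 32, INT32_MIN is printed as
    // "-INT32_C(2147483647) - 1".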
| for (unsigned d = 0; d < results_length; d++) { |
| if (results[d] == int_d_min) { |
| printf(" -INT%u_C(%" PRId64 ") - 1,\n", d_bits, int_d_max); |
| } else { |
| // Some constants (such as those between INT32_MAX and UINT32_MAX) |
| // trigger compiler warnings. To avoid these warnings, use an |
| // appropriate macro to make the type explicit. |
| int64_t result_int64 = static_cast<int64_t>(results[d]); |
| if (result_int64 >= 0) { |
| printf(" INT%u_C(%" PRId64 "),\n", d_bits, result_int64); |
| } else { |
| printf(" -INT%u_C(%" PRId64 "),\n", d_bits, -result_int64); |
| } |
| } |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 " (%s %g):\n", |
| name, |
| n_bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| name, |
| rawbits_to_fp(inputs[n])); |
| printf(" Expected: 0x%0*" PRIx64 " (%" PRId64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(expected[d]), |
| static_cast<int64_t>(expected[d])); |
| printf(" Found: 0x%0*" PRIx64 " (%" PRId64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(results[d]), |
| static_cast<int64_t>(results[d])); |
| printf("\n"); |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // Test FP instructions. |
| // - The inputs[] array should be an array of rawbits representations of |
| // doubles or floats. This ensures that exact bit comparisons can be |
| // performed. |
| // - The expected[] array should be an array of unsigned integers. |
| template <typename Tn, typename Td> |
| static void TestFPToU(const char* name, |
| TestFPToIntHelper_t helper, |
| const Tn inputs[], |
| unsigned inputs_length, |
| const Td expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned results_length = inputs_length; |
| Td* results = new Td[results_length]; |
| |
| const unsigned d_bits = sizeof(Td) * 8; |
| const unsigned n_bits = sizeof(Tn) * 8; |
| bool skipped; |
| |
| TestFPToInt_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| d_bits, |
| n_bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_%s[] = {\n", d_bits, name); |
| for (unsigned d = 0; d < results_length; d++) { |
| printf(" %" PRIu64 "u,\n", static_cast<uint64_t>(results[d])); |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 " (%s %g):\n", |
| name, |
| n_bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| name, |
| rawbits_to_fp(inputs[n])); |
| printf(" Expected: 0x%0*" PRIx64 " (%" PRIu64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(expected[d]), |
| static_cast<uint64_t>(expected[d])); |
| printf(" Found: 0x%0*" PRIx64 " (%" PRIu64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(results[d]), |
| static_cast<uint64_t>(results[d])); |
| printf("\n"); |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // Test FP instructions. |
| // - The inputs[] array should be an array of rawbits representations of |
| // doubles or floats. This ensures that exact bit comparisons can be |
| // performed. |
| // - The expected[] array should be an array of signed integers. |
| template <typename Tn, typename Td> |
| static void TestFPToFixedS(const char* name, |
| TestFPToFixedHelper_t helper, |
| const Tn inputs[], |
| unsigned inputs_length, |
| const Td expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned d_bits = sizeof(Td) * 8; |
| const unsigned n_bits = sizeof(Tn) * 8; |
| |
| const unsigned results_length = inputs_length * (d_bits + 1); |
| Td* results = new Td[results_length]; |
| |
| bool skipped; |
| |
| TestFPToFixed_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| d_bits, |
| n_bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const int%u_t kExpected_%s[] = {\n", d_bits, name); |
| // There is no simple C++ literal for INT*_MIN that doesn't produce |
| // warnings, so we use an appropriate constant in that case instead. |
| // Deriving int_d_min in this way (rather than just checking INT64_MIN and |
| // the like) avoids warnings about comparing values with differing ranges. |
| const int64_t int_d_max = (UINT64_C(1) << (d_bits - 1)) - 1; |
| const int64_t int_d_min = -(int_d_max)-1; |
| for (unsigned d = 0; d < results_length; d++) { |
| if (results[d] == int_d_min) { |
| printf(" -INT%u_C(%" PRId64 ") - 1,\n", d_bits, int_d_max); |
| } else { |
| // Some constants (such as those between INT32_MAX and UINT32_MAX) |
| // trigger compiler warnings. To avoid these warnings, use an |
| // appropriate macro to make the type explicit. |
| int64_t result_int64 = static_cast<int64_t>(results[d]); |
| if (result_int64 >= 0) { |
| printf(" INT%u_C(%" PRId64 "),\n", d_bits, result_int64); |
| } else { |
| printf(" -INT%u_C(%" PRId64 "),\n", d_bits, -result_int64); |
| } |
| } |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++) { |
| for (unsigned fbits = 0; fbits <= d_bits; ++fbits, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 " #%d (%s %g #%d):\n", |
| name, |
| n_bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| fbits, |
| name, |
| rawbits_to_fp(inputs[n]), |
| fbits); |
| printf(" Expected: 0x%0*" PRIx64 " (%" PRId64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(expected[d]), |
| static_cast<int64_t>(expected[d])); |
| printf(" Found: 0x%0*" PRIx64 " (%" PRId64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(results[d]), |
| static_cast<int64_t>(results[d])); |
| printf("\n"); |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // Test FP instructions. |
| // - The inputs[] array should be an array of rawbits representations of |
| // doubles or floats. This ensures that exact bit comparisons can be |
| // performed. |
| // - The expected[] array should be an array of unsigned integers. |
| template <typename Tn, typename Td> |
| static void TestFPToFixedU(const char* name, |
| TestFPToFixedHelper_t helper, |
| const Tn inputs[], |
| unsigned inputs_length, |
| const Td expected[], |
| unsigned expected_length) { |
| VIXL_ASSERT(inputs_length > 0); |
| |
| const unsigned d_bits = sizeof(Td) * 8; |
| const unsigned n_bits = sizeof(Tn) * 8; |
| |
| const unsigned results_length = inputs_length * (d_bits + 1); |
| Td* results = new Td[results_length]; |
| |
| bool skipped; |
| |
| TestFPToFixed_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs), |
| inputs_length, |
| reinterpret_cast<uintptr_t>(results), |
| d_bits, |
| n_bits, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_%s[] = {\n", d_bits, name); |
| for (unsigned d = 0; d < results_length; d++) { |
| printf(" %" PRIu64 "u,\n", static_cast<uint64_t>(results[d])); |
| } |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_%s = %u;\n", name, results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| for (unsigned n = 0; n < inputs_length; n++) { |
| for (unsigned fbits = 0; fbits <= d_bits; ++fbits, d++) { |
| if (results[d] != expected[d]) { |
| if (++error_count > kErrorReportLimit) continue; |
| |
| printf("%s 0x%0*" PRIx64 " #%d (%s %g #%d):\n", |
| name, |
| n_bits / 4, |
| static_cast<uint64_t>(inputs[n]), |
| fbits, |
| name, |
| rawbits_to_fp(inputs[n]), |
| fbits); |
| printf(" Expected: 0x%0*" PRIx64 " (%" PRIu64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(expected[d]), |
| static_cast<uint64_t>(expected[d])); |
| printf(" Found: 0x%0*" PRIx64 " (%" PRIu64 ")\n", |
| d_bits / 4, |
| static_cast<uint64_t>(results[d]), |
| static_cast<uint64_t>(results[d])); |
| printf("\n"); |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // ==== Tests for instructions of the form <INST> VReg, VReg. ==== |
| |
| |
| static void Test1OpNEON_Helper(Test1OpNEONHelper_t helper, |
| uintptr_t inputs_n, |
| unsigned inputs_n_length, |
| uintptr_t results, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| bool* skipped) { |
| VIXL_ASSERT(vd_form != kFormatUndefined); |
| VIXL_ASSERT(vn_form != kFormatUndefined); |
| |
| CPUFeatures features; |
| features.Combine(CPUFeatures::kNEON, |
| CPUFeatures::kFP, |
| CPUFeatures::kRDM, |
| CPUFeatures::kNEONHalf); |
| // For frint{32,64}{x,y} variants. |
| features.Combine(CPUFeatures::kFrintToFixedSizedInt); |
| SETUP_WITH_FEATURES(features); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n; |
| |
| Register out = x0; |
| Register inputs_n_base = x1; |
| Register inputs_n_last_16bytes = x3; |
| Register index_n = x5; |
| |
| // TODO: Refactor duplicate definitions below with a VRegister::As() routine. |
| const unsigned vd_bits = RegisterSizeInBitsFromFormat(vd_form); |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| |
| const unsigned vn_bits = RegisterSizeInBitsFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vn_form); |
| const unsigned vn_lane_bits = LaneSizeInBitsFromFormat(vn_form); |
| |
| |
| // These will be either a D- or a Q-register form, with a single lane |
| // (for use in scalar load and store operations). |
| VRegister vd = VRegister(0, vd_bits); |
| VRegister vn = v1.V16B(); |
| VRegister vntmp = v3.V16B(); |
| |
| // These will have the correct format for use when calling 'helper'. |
| VRegister vd_helper = VRegister(0, vd_bits, vd_lane_count); |
| VRegister vn_helper = VRegister(1, vn_bits, vn_lane_count); |
| |
| // 'v*tmp_single' will be either 'Vt.B', 'Vt.H', 'Vt.S' or 'Vt.D'. |
| VRegister vntmp_single = VRegister(3, vn_lane_bits); |
| |
| __ Mov(out, results); |
| |
| __ Mov(inputs_n_base, inputs_n); |
| __ Mov(inputs_n_last_16bytes, |
| inputs_n + (vn_lane_bytes * inputs_n_length) - 16); |
| |
| __ Ldr(vn, MemOperand(inputs_n_last_16bytes)); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| |
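  // Load the next input value into the lowest lane of 'vntmp', then use Ext
  // to rotate it into 'vn'. Over successive iterations this slides a window
  // across the inputs so that every lane position sees every input value.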
| __ Ldr(vntmp_single, |
| MemOperand(inputs_n_base, index_n, LSL, vn_lane_bytes_log2)); |
| __ Ext(vn, vn, vntmp, vn_lane_bytes); |
| |
| // Set the destination to zero. |
| // TODO: Setting the destination to values other than zero |
| // might be a better test for instructions such as sqxtn2 |
| // which may leave parts of V registers unchanged. |
| __ Movi(vd.V16B(), 0); |
| |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(vd_helper, vn_helper); |
| } |
| __ Str(vd, MemOperand(out, vd.GetSizeInBytes(), PostIndex)); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_n_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
// Test NEON instructions. The inputs_*[] and expected[] arrays should be
// arrays of rawbits representations of the input values. This ensures that
// exact bit comparisons can be performed.
| template <typename Td, typename Tn> |
| static void Test1OpNEON(const char* name, |
| Test1OpNEONHelper_t helper, |
| const Tn inputs_n[], |
| unsigned inputs_n_length, |
| const Td expected[], |
| unsigned expected_length, |
| VectorFormat vd_form, |
| VectorFormat vn_form) { |
| VIXL_ASSERT(inputs_n_length > 0); |
| |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| |
| const unsigned results_length = inputs_n_length; |
| Td* results = new Td[results_length * vd_lane_count]; |
| const unsigned lane_bit = sizeof(Td) * 8; |
| const unsigned lane_len_in_hex = MaxHexCharCount<Td, Tn>(); |
| |
| bool skipped; |
| |
| Test1OpNEON_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs_n), |
| inputs_n_length, |
| reinterpret_cast<uintptr_t>(results), |
| vd_form, |
| vn_form, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_NEON_%s[] = {\n", lane_bit, name); |
| for (unsigned iteration = 0; iteration < results_length; iteration++) { |
| printf(" "); |
| // Output a separate result for each element of the result vector. |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned index = lane + (iteration * vd_lane_count); |
| printf(" 0x%0*" PRIx64 ",", |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[index])); |
| } |
| printf("\n"); |
| } |
| |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_NEON_%s = %u;\n", |
| name, |
| results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| const char* padding = " "; |
| VIXL_ASSERT(strlen(padding) >= (lane_len_in_hex + 1)); |
| for (unsigned n = 0; n < inputs_n_length; n++, d++) { |
| bool error_in_vector = false; |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = (n * vd_lane_count) + lane; |
| |
| if (results[output_index] != expected[output_index]) { |
| error_in_vector = true; |
| break; |
| } |
| } |
| |
| if (error_in_vector && (++error_count <= kErrorReportLimit)) { |
| printf("%s\n", name); |
| printf(" Vn%.*s| Vd%.*s| Expected\n", |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding); |
| |
| const unsigned first_index_n = |
| inputs_n_length - (16 / vn_lane_bytes) + n + 1; |
| |
| for (unsigned lane = 0; lane < std::max(vd_lane_count, vn_lane_count); |
| lane++) { |
| unsigned output_index = (n * vd_lane_count) + lane; |
| unsigned input_index_n = (first_index_n + lane) % inputs_n_length; |
| |
| printf("%c0x%0*" PRIx64 " | 0x%0*" PRIx64 |
| " " |
| "| 0x%0*" PRIx64 "\n", |
| results[output_index] != expected[output_index] ? '*' : ' ', |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_n[input_index_n]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[output_index]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(expected[output_index])); |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // ==== Tests for instructions of the form <mnemonic> <V><d>, <Vn>.<T> ==== |
| // where <V> is one of B, H, S or D registers. |
| // e.g. saddlv H1, v0.8B |
| |
| // TODO: Change tests to store all lanes of the resulting V register. |
| // Some tests store all 128 bits of the resulting V register to |
| // check the simulator's behaviour on the rest of the register. |
| // This is better than storing the affected lanes only. |
| // Change any tests such as the 'Across' template to do the same. |
| |
| static void Test1OpAcrossNEON_Helper(Test1OpNEONHelper_t helper, |
| uintptr_t inputs_n, |
| unsigned inputs_n_length, |
| uintptr_t results, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| bool* skipped) { |
| VIXL_ASSERT(vd_form != kFormatUndefined); |
| VIXL_ASSERT(vn_form != kFormatUndefined); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kNEON, |
| CPUFeatures::kFP, |
| CPUFeatures::kNEONHalf); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n; |
| |
| Register out = x0; |
| Register inputs_n_base = x1; |
| Register inputs_n_last_vector = x3; |
| Register index_n = x5; |
| |
| // TODO: Refactor duplicate definitions below with a VRegister::As() routine. |
| const unsigned vd_bits = RegisterSizeInBitsFromFormat(vd_form); |
| const unsigned vn_bits = RegisterSizeInBitsFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vn_form); |
| const unsigned vn_lane_bits = LaneSizeInBitsFromFormat(vn_form); |
| |
| // Test destructive operations by (arbitrarily) using the same register for |
| // B and S lane sizes. |
| bool destructive = (vd_bits == kBRegSize) || (vd_bits == kSRegSize); |
| |
| // Create two aliases for v0; the first is the destination for the tested |
| // instruction, the second, the whole Q register to check the results. |
| VRegister vd = VRegister(0, vd_bits); |
| VRegister vdstr = VRegister(0, kQRegSize); |
| |
| VRegister vn = VRegister(1, vn_bits); |
| VRegister vntmp = VRegister(3, vn_bits); |
| |
| // These will have the correct format for use when calling 'helper'. |
| VRegister vd_helper = VRegister(0, vn_bits, vn_lane_count); |
| VRegister vn_helper = VRegister(1, vn_bits, vn_lane_count); |
| |
| // 'v*tmp_single' will be either 'Vt.B', 'Vt.H', 'Vt.S' or 'Vt.D'. |
| VRegister vntmp_single = VRegister(3, vn_lane_bits); |
| |
| // Same registers for use in the 'ext' instructions. |
| VRegister vn_ext = (kDRegSize == vn_bits) ? vn.V8B() : vn.V16B(); |
| VRegister vntmp_ext = (kDRegSize == vn_bits) ? vntmp.V8B() : vntmp.V16B(); |
| |
| __ Mov(out, results); |
| |
| __ Mov(inputs_n_base, inputs_n); |
| __ Mov(inputs_n_last_vector, |
| inputs_n + vn_lane_bytes * (inputs_n_length - vn_lane_count)); |
| |
| __ Ldr(vn, MemOperand(inputs_n_last_vector)); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| |
| __ Ldr(vntmp_single, |
| MemOperand(inputs_n_base, index_n, LSL, vn_lane_bytes_log2)); |
| __ Ext(vn_ext, vn_ext, vntmp_ext, vn_lane_bytes); |
| |
| if (destructive) { |
| __ Mov(vd_helper, vn_helper); |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(vd, vd_helper); |
| } else { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(vd, vn_helper); |
| } |
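| // In the destructive case above, the inputs are first copied into v0 so |
| // that the tested instruction reads from and writes to the same |
| // register; in the non-destructive case it reads from v1 instead. |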
| |
| __ Str(vdstr, MemOperand(out, kQRegSizeInBytes, PostIndex)); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_n_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| // Test NEON instructions. The inputs_*[] and expected[] arrays should be |
| // arrays of the rawbit representations of the input values, so that |
| // exact bit comparisons can be performed. |
| template <typename Td, typename Tn> |
| static void Test1OpAcrossNEON(const char* name, |
| Test1OpNEONHelper_t helper, |
| const Tn inputs_n[], |
| unsigned inputs_n_length, |
| const Td expected[], |
| unsigned expected_length, |
| VectorFormat vd_form, |
| VectorFormat vn_form) { |
| VIXL_ASSERT(inputs_n_length > 0); |
| |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| const unsigned vd_lanes_per_q = MaxLaneCountFromFormat(vd_form); |
| |
| const unsigned results_length = inputs_n_length; |
| Td* results = new Td[results_length * vd_lanes_per_q]; |
| const unsigned lane_bit = sizeof(Td) * 8; |
| const unsigned lane_len_in_hex = MaxHexCharCount<Td, Tn>(); |
| |
| bool skipped; |
| |
| Test1OpAcrossNEON_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs_n), |
| inputs_n_length, |
| reinterpret_cast<uintptr_t>(results), |
| vd_form, |
| vn_form, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
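| // For a hypothetical 'Across' test named "addv_S_4S", the output takes |
| // the form: |
| //   const uint32_t kExpected_NEON_addv_S_4S[] = { |
| //     0x...., |
| //     0x...., |
| //     ... |
| //   }; |
| //   const unsigned kExpectedCount_NEON_addv_S_4S = <results_length>; |
| // with one row per input iteration, each holding the vd_lane_count |
| // (here, one) result values. |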
| printf("const uint%u_t kExpected_NEON_%s[] = {\n", lane_bit, name); |
| for (unsigned iteration = 0; iteration < results_length; iteration++) { |
| printf(" "); |
| // Output a separate result for each element of the result vector. |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned index = lane + (iteration * vd_lanes_per_q); |
| printf(" 0x%0*" PRIx64 ",", |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[index])); |
| } |
| printf("\n"); |
| } |
| |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_NEON_%s = %u;\n", |
| name, |
| results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| const char* padding = " "; |
| VIXL_ASSERT(strlen(padding) >= (lane_len_in_hex + 1)); |
| for (unsigned n = 0; n < inputs_n_length; n++, d++) { |
| bool error_in_vector = false; |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned expected_index = (n * vd_lane_count) + lane; |
| unsigned results_index = (n * vd_lanes_per_q) + lane; |
| |
| if (results[results_index] != expected[expected_index]) { |
| error_in_vector = true; |
| break; |
| } |
| } |
| |
| // For across operations, the remaining lanes should be zero. |
| for (unsigned lane = vd_lane_count; lane < vd_lanes_per_q; lane++) { |
| unsigned results_index = (n * vd_lanes_per_q) + lane; |
| if (results[results_index] != 0) { |
| error_in_vector = true; |
| break; |
| } |
| } |
| |
| if (error_in_vector && (++error_count <= kErrorReportLimit)) { |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| |
| printf("%s\n", name); |
| printf(" Vn%.*s| Vd%.*s| Expected\n", |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding); |
| |
| // TODO: In case of an error, all tests print out as many elements as |
| // there are lanes in the output or input vectors. This lets the viewer |
| // read every value that was needed for the operation, but the output |
| // also contains unnecessary values. These prints could be tailored to |
| // the arguments passed to the test functions. The output for the |
| // 'Across' category already has the required modifications. |
| for (unsigned lane = 0; lane < vn_lane_count; lane++) { |
| unsigned results_index = |
| (n * vd_lanes_per_q) + ((vn_lane_count - 1) - lane); |
| unsigned input_index_n = |
| (inputs_n_length - vn_lane_count + n + 1 + lane) % |
| inputs_n_length; |
| |
| Td expect = 0; |
| if ((vn_lane_count - 1) == lane) { |
| // This is the last lane to be printed, i.e. the least-significant |
| // lane, so use the expected value; any other lane should be zero. |
| unsigned expected_index = n * vd_lane_count; |
| expect = expected[expected_index]; |
| } |
| printf("%c0x%0*" PRIx64 " | 0x%0*" PRIx64 " | 0x%0*" PRIx64 "\n", |
| results[results_index] != expect ? '*' : ' ', |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_n[input_index_n]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[results_index]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(expect)); |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // ==== Tests for instructions of the form <INST> VReg, VReg, VReg. ==== |
| |
| // TODO: Iterate over inputs_d once the traces file is split. |
| |
| static void Test2OpNEON_Helper(Test2OpNEONHelper_t helper, |
| uintptr_t inputs_d, |
| uintptr_t inputs_n, |
| unsigned inputs_n_length, |
| uintptr_t inputs_m, |
| unsigned inputs_m_length, |
| uintptr_t results, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| VectorFormat vm_form, |
| bool* skipped) { |
| VIXL_ASSERT(vd_form != kFormatUndefined); |
| VIXL_ASSERT(vn_form != kFormatUndefined); |
| VIXL_ASSERT(vm_form != kFormatUndefined); |
| |
| CPUFeatures features; |
| features.Combine(CPUFeatures::kNEON, CPUFeatures::kNEONHalf); |
| features.Combine(CPUFeatures::kFP); |
| features.Combine(CPUFeatures::kRDM); |
| features.Combine(CPUFeatures::kDotProduct); |
| features.Combine(CPUFeatures::kFHM); |
| SETUP_WITH_FEATURES(features); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n, loop_m; |
| |
| Register out = x0; |
| Register inputs_n_base = x1; |
| Register inputs_m_base = x2; |
| Register inputs_d_base = x3; |
| Register inputs_n_last_16bytes = x4; |
| Register inputs_m_last_16bytes = x5; |
| Register index_n = x6; |
| Register index_m = x7; |
| |
| // TODO: Refactor duplicate definitions below with a VRegister::As() routine. |
| const unsigned vd_bits = RegisterSizeInBitsFromFormat(vd_form); |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| |
| const unsigned vn_bits = RegisterSizeInBitsFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vn_form); |
| const unsigned vn_lane_bits = LaneSizeInBitsFromFormat(vn_form); |
| |
| const unsigned vm_bits = RegisterSizeInBitsFromFormat(vm_form); |
| const unsigned vm_lane_count = LaneCountFromFormat(vm_form); |
| const unsigned vm_lane_bytes = LaneSizeInBytesFromFormat(vm_form); |
| const unsigned vm_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vm_form); |
| const unsigned vm_lane_bits = LaneSizeInBitsFromFormat(vm_form); |
| |
| |
| // Always load and store 128 bits regardless of the format. |
| VRegister vd = v0.V16B(); |
| VRegister vn = v1.V16B(); |
| VRegister vm = v2.V16B(); |
| VRegister vntmp = v3.V16B(); |
| VRegister vmtmp = v4.V16B(); |
| VRegister vres = v5.V16B(); |
| |
| // These will have the correct format for calling the 'helper'. |
| VRegister vn_helper = VRegister(1, vn_bits, vn_lane_count); |
| VRegister vm_helper = VRegister(2, vm_bits, vm_lane_count); |
| VRegister vres_helper = VRegister(5, vd_bits, vd_lane_count); |
| |
| // 'v*tmp_single' will be either 'Vt.B', 'Vt.H', 'Vt.S' or 'Vt.D'. |
| VRegister vntmp_single = VRegister(3, vn_lane_bits); |
| VRegister vmtmp_single = VRegister(4, vm_lane_bits); |
| |
| __ Mov(out, results); |
| |
| __ Mov(inputs_d_base, inputs_d); |
| |
| __ Mov(inputs_n_base, inputs_n); |
| __ Mov(inputs_n_last_16bytes, |
| inputs_n + (vn_lane_bytes * inputs_n_length) - 16); |
| __ Mov(inputs_m_base, inputs_m); |
| __ Mov(inputs_m_last_16bytes, |
| inputs_m + (vm_lane_bytes * inputs_m_length) - 16); |
| |
| __ Ldr(vd, MemOperand(inputs_d_base)); |
| __ Ldr(vn, MemOperand(inputs_n_last_16bytes)); |
| __ Ldr(vm, MemOperand(inputs_m_last_16bytes)); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| |
| __ Ldr(vntmp_single, |
| MemOperand(inputs_n_base, index_n, LSL, vn_lane_bytes_log2)); |
| __ Ext(vn, vn, vntmp, vn_lane_bytes); |
| |
| __ Mov(index_m, 0); |
| __ Bind(&loop_m); |
| |
| __ Ldr(vmtmp_single, |
| MemOperand(inputs_m_base, index_m, LSL, vm_lane_bytes_log2)); |
| __ Ext(vm, vm, vmtmp, vm_lane_bytes); |
| |
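| // Reset the destination from the accumulator inputs before each |
| // operation, so that accumulating forms (those tested with the |
| // kInput*AccDestination lists) always start from a known value. |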
| __ Mov(vres, vd); |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(vres_helper, vn_helper, vm_helper); |
| } |
| __ Str(vres, MemOperand(out, vd.GetSizeInBytes(), PostIndex)); |
| |
| __ Add(index_m, index_m, 1); |
| __ Cmp(index_m, inputs_m_length); |
| __ B(lo, &loop_m); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_n_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test NEON instructions. The inputs_*[] and expected[] arrays should be |
| // arrays of the rawbit representations of the input values, so that |
| // exact bit comparisons can be performed. |
| template <typename Td, typename Tn, typename Tm> |
| static void Test2OpNEON(const char* name, |
| Test2OpNEONHelper_t helper, |
| const Td inputs_d[], |
| const Tn inputs_n[], |
| unsigned inputs_n_length, |
| const Tm inputs_m[], |
| unsigned inputs_m_length, |
| const Td expected[], |
| unsigned expected_length, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| VectorFormat vm_form) { |
| VIXL_ASSERT(inputs_n_length > 0 && inputs_m_length > 0); |
| |
| const unsigned vd_lane_count = MaxLaneCountFromFormat(vd_form); |
| |
| const unsigned results_length = inputs_n_length * inputs_m_length; |
| Td* results = new Td[results_length * vd_lane_count]; |
| const unsigned lane_bit = sizeof(Td) * 8; |
| const unsigned lane_len_in_hex = MaxHexCharCount<Td, Tm>(); |
| |
| bool skipped; |
| |
| Test2OpNEON_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs_d), |
| reinterpret_cast<uintptr_t>(inputs_n), |
| inputs_n_length, |
| reinterpret_cast<uintptr_t>(inputs_m), |
| inputs_m_length, |
| reinterpret_cast<uintptr_t>(results), |
| vd_form, |
| vn_form, |
| vm_form, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_NEON_%s[] = {\n", lane_bit, name); |
| for (unsigned iteration = 0; iteration < results_length; iteration++) { |
| printf(" "); |
| // Output a separate result for each element of the result vector. |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned index = lane + (iteration * vd_lane_count); |
| printf(" 0x%0*" PRIx64 ",", |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[index])); |
| } |
| printf("\n"); |
| } |
| |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_NEON_%s = %u;\n", |
| name, |
| results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| const char* padding = " "; |
| VIXL_ASSERT(strlen(padding) >= (lane_len_in_hex + 1)); |
| for (unsigned n = 0; n < inputs_n_length; n++) { |
| for (unsigned m = 0; m < inputs_m_length; m++, d++) { |
| bool error_in_vector = false; |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = (n * inputs_m_length * vd_lane_count) + |
| (m * vd_lane_count) + lane; |
| |
| if (results[output_index] != expected[output_index]) { |
| error_in_vector = true; |
| break; |
| } |
| } |
| |
| if (error_in_vector && (++error_count <= kErrorReportLimit)) { |
| printf("%s\n", name); |
| printf(" Vd%.*s| Vn%.*s| Vm%.*s| Vd%.*s| Expected\n", |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding); |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = (n * inputs_m_length * vd_lane_count) + |
| (m * vd_lane_count) + lane; |
| unsigned input_index_n = |
| (inputs_n_length - vd_lane_count + n + 1 + lane) % |
| inputs_n_length; |
| unsigned input_index_m = |
| (inputs_m_length - vd_lane_count + m + 1 + lane) % |
| inputs_m_length; |
| |
| printf("%c0x%0*" PRIx64 " | 0x%0*" PRIx64 " | 0x%0*" PRIx64 |
| " " |
| "| 0x%0*" PRIx64 " | 0x%0*" PRIx64 "\n", |
| results[output_index] != expected[output_index] ? '*' : ' ', |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_d[lane]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_n[input_index_n]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_m[input_index_m]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[output_index]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(expected[output_index])); |
| } |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // ==== Tests for instructions of the form <INST> Vd, Vn, Vm[<#index>]. ==== |
| |
| static void TestByElementNEON_Helper(TestByElementNEONHelper_t helper, |
| uintptr_t inputs_d, |
| uintptr_t inputs_n, |
| unsigned inputs_n_length, |
| uintptr_t inputs_m, |
| unsigned inputs_m_length, |
| const int indices[], |
| unsigned indices_length, |
| uintptr_t results, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| VectorFormat vm_form, |
| unsigned vm_subvector_count, |
| bool* skipped) { |
| VIXL_ASSERT(vd_form != kFormatUndefined); |
| VIXL_ASSERT(vn_form != kFormatUndefined); |
| VIXL_ASSERT(vm_form != kFormatUndefined); |
| VIXL_ASSERT((vm_subvector_count != 0) && IsPowerOf2(vm_subvector_count)); |
| |
| CPUFeatures features; |
| features.Combine(CPUFeatures::kNEON, CPUFeatures::kNEONHalf); |
| features.Combine(CPUFeatures::kFP); |
| features.Combine(CPUFeatures::kRDM); |
| features.Combine(CPUFeatures::kDotProduct); |
| features.Combine(CPUFeatures::kFHM); |
| SETUP_WITH_FEATURES(features); |
| |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n, loop_m; |
| |
| Register out = x0; |
| Register inputs_n_base = x1; |
| Register inputs_m_base = x2; |
| Register inputs_d_base = x3; |
| Register inputs_n_last_16bytes = x4; |
| Register inputs_m_last_16bytes = x5; |
| Register index_n = x6; |
| Register index_m = x7; |
| |
| // TODO: Refactor duplicate definitions below with a VRegister::As() routine. |
| const unsigned vd_bits = RegisterSizeInBitsFromFormat(vd_form); |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| |
| const unsigned vn_bits = RegisterSizeInBitsFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vn_form); |
| const unsigned vn_lane_bits = LaneSizeInBitsFromFormat(vn_form); |
| |
| const unsigned vm_bits = RegisterSizeInBitsFromFormat(vm_form); |
| const unsigned vm_lane_count = LaneCountFromFormat(vm_form); |
| const unsigned vm_lane_bytes = LaneSizeInBytesFromFormat(vm_form); |
| const unsigned vm_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vm_form); |
| const unsigned vm_lane_bits = LaneSizeInBitsFromFormat(vm_form); |
| |
| VIXL_ASSERT((vm_bits * vm_subvector_count) <= kQRegSize); |
| |
| // Always load and store 128 bits regardless of the format. |
| VRegister vd = v0.V16B(); |
| VRegister vn = v1.V16B(); |
| VRegister vm = v2.V16B(); |
| VRegister vntmp = v3.V16B(); |
| VRegister vmtmp = v4.V16B(); |
| VRegister vres = v5.V16B(); |
| |
| // These will have the correct format for calling the 'helper'. |
| VRegister vn_helper = VRegister(1, vn_bits, vn_lane_count); |
| VRegister vm_helper = |
| VRegister(2, vm_bits * vm_subvector_count, vm_lane_count); |
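| // 'vm_helper' spans 'vm_subvector_count' subvectors so that by-element |
| // forms which index a subvector rather than a single lane (e.g. the |
| // dot-product instructions) can address their whole indexable range. |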
| VRegister vres_helper = VRegister(5, vd_bits, vd_lane_count); |
| |
| // 'v*tmp_single' will be either 'Vt.B', 'Vt.H', 'Vt.S' or 'Vt.D'. |
| VRegister vntmp_single = VRegister(3, vn_lane_bits); |
| VRegister vmtmp_single = VRegister(4, vm_lane_bits); |
| |
| __ Mov(out, results); |
| |
| __ Mov(inputs_d_base, inputs_d); |
| |
| __ Mov(inputs_n_base, inputs_n); |
| __ Mov(inputs_n_last_16bytes, |
| inputs_n + (vn_lane_bytes * inputs_n_length) - 16); |
| __ Mov(inputs_m_base, inputs_m); |
| __ Mov(inputs_m_last_16bytes, |
| inputs_m + (vm_lane_bytes * inputs_m_length) - 16); |
| |
| __ Ldr(vd, MemOperand(inputs_d_base)); |
| __ Ldr(vn, MemOperand(inputs_n_last_16bytes)); |
| __ Ldr(vm, MemOperand(inputs_m_last_16bytes)); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| |
| __ Ldr(vntmp_single, |
| MemOperand(inputs_n_base, index_n, LSL, vn_lane_bytes_log2)); |
| __ Ext(vn, vn, vntmp, vn_lane_bytes); |
| |
| __ Mov(index_m, 0); |
| __ Bind(&loop_m); |
| |
| __ Ldr(vmtmp_single, |
| MemOperand(inputs_m_base, index_m, LSL, vm_lane_bytes_log2)); |
| __ Ext(vm, vm, vmtmp, vm_lane_bytes); |
| |
| __ Mov(vres, vd); |
| { |
| for (unsigned i = 0; i < indices_length; i++) { |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(vres_helper, vn_helper, vm_helper, indices[i]); |
| } |
| __ Str(vres, MemOperand(out, vd.GetSizeInBytes(), PostIndex)); |
| } |
| } |
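| // One result vector is stored per index, so the results buffer receives |
| // inputs_n_length * inputs_m_length * indices_length entries in total, |
| // matching 'results_length' in TestByElementNEON below. |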
| |
| __ Add(index_m, index_m, 1); |
| __ Cmp(index_m, inputs_m_length); |
| __ B(lo, &loop_m); |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_n_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test NEON instructions. The inputs_*[] and expected[] arrays should be |
| // arrays of the rawbit representations of the input values, so that |
| // exact bit comparisons can be performed. |
| template <typename Td, typename Tn, typename Tm> |
| static void TestByElementNEON(const char* name, |
| TestByElementNEONHelper_t helper, |
| const Td inputs_d[], |
| const Tn inputs_n[], |
| unsigned inputs_n_length, |
| const Tm inputs_m[], |
| unsigned inputs_m_length, |
| const int indices[], |
| unsigned indices_length, |
| const Td expected[], |
| unsigned expected_length, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| VectorFormat vm_form, |
| unsigned vm_subvector_count = 1) { |
| VIXL_ASSERT(inputs_n_length > 0); |
| VIXL_ASSERT(inputs_m_length > 0); |
| VIXL_ASSERT(indices_length > 0); |
| |
| const unsigned vd_lane_count = MaxLaneCountFromFormat(vd_form); |
| |
| const unsigned results_length = |
| inputs_n_length * inputs_m_length * indices_length; |
| Td* results = new Td[results_length * vd_lane_count]; |
| const unsigned lane_bit = sizeof(Td) * 8; |
| const unsigned lane_len_in_hex = MaxHexCharCount<Td, Tm>(); |
| |
| bool skipped; |
| |
| TestByElementNEON_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs_d), |
| reinterpret_cast<uintptr_t>(inputs_n), |
| inputs_n_length, |
| reinterpret_cast<uintptr_t>(inputs_m), |
| inputs_m_length, |
| indices, |
| indices_length, |
| reinterpret_cast<uintptr_t>(results), |
| vd_form, |
| vn_form, |
| vm_form, |
| vm_subvector_count, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_NEON_%s[] = {\n", lane_bit, name); |
| for (unsigned iteration = 0; iteration < results_length; iteration++) { |
| printf(" "); |
| // Output a separate result for each element of the result vector. |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned index = lane + (iteration * vd_lane_count); |
| printf(" 0x%0*" PRIx64 ",", |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[index])); |
| } |
| printf("\n"); |
| } |
| |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_NEON_%s = %u;\n", |
| name, |
| results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| const char* padding = " "; |
| VIXL_ASSERT(strlen(padding) >= (lane_len_in_hex + 1)); |
| for (unsigned n = 0; n < inputs_n_length; n++) { |
| for (unsigned m = 0; m < inputs_m_length; m++) { |
| for (unsigned index = 0; index < indices_length; index++, d++) { |
| bool error_in_vector = false; |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = |
| (n * inputs_m_length * indices_length * vd_lane_count) + |
| (m * indices_length * vd_lane_count) + (index * vd_lane_count) + |
| lane; |
| |
| if (results[output_index] != expected[output_index]) { |
| error_in_vector = true; |
| break; |
| } |
| } |
| |
| if (error_in_vector && (++error_count <= kErrorReportLimit)) { |
| printf("%s\n", name); |
| printf(" Vd%.*s| Vn%.*s| Vm%.*s| Index | Vd%.*s| Expected\n", |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex + 1, |
| padding); |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = |
| (n * inputs_m_length * indices_length * vd_lane_count) + |
| (m * indices_length * vd_lane_count) + |
| (index * vd_lane_count) + lane; |
| unsigned input_index_n = |
| (inputs_n_length - vd_lane_count + n + 1 + lane) % |
| inputs_n_length; |
| unsigned input_index_m = |
| (inputs_m_length - vd_lane_count + m + 1 + lane) % |
| inputs_m_length; |
| |
| printf("%c0x%0*" PRIx64 " | 0x%0*" PRIx64 " | 0x%0*" PRIx64 |
| " " |
| "| [%3d] | 0x%0*" PRIx64 " | 0x%0*" PRIx64 "\n", |
| results[output_index] != expected[output_index] ? '*' |
| : ' ', |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_d[lane]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_n[input_index_n]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_m[input_index_m]), |
| indices[index], |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[output_index]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(expected[output_index])); |
| } |
| } |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // ==== Tests for instructions of the form <INST> VReg, VReg, #Immediate. ==== |
| |
| |
| template <typename Tm> |
| void Test2OpImmNEON_Helper( |
| typename Test2OpImmediateNEONHelper_t<Tm>::mnemonic helper, |
| uintptr_t inputs_n, |
| unsigned inputs_n_length, |
| const Tm inputs_m[], |
| unsigned inputs_m_length, |
| uintptr_t results, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| bool* skipped) { |
| VIXL_ASSERT(vd_form != kFormatUndefined && vn_form != kFormatUndefined); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kNEON, |
| CPUFeatures::kFP, |
| CPUFeatures::kNEONHalf); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n; |
| |
| Register out = x0; |
| Register inputs_n_base = x1; |
| Register inputs_n_last_16bytes = x3; |
| Register index_n = x5; |
| |
| // TODO: Refactor duplicate definitions below with a VRegister::As() routine. |
| const unsigned vd_bits = RegisterSizeInBitsFromFormat(vd_form); |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| |
| const unsigned vn_bits = RegisterSizeInBitsFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vn_form); |
| const unsigned vn_lane_bits = LaneSizeInBitsFromFormat(vn_form); |
| |
| |
| // 'vd' will be either a D- or a Q-register form, with a single lane (for |
| // use in scalar load and store operations); 'vn' and 'vntmp' always use |
| // the full 128 bits. |
| VRegister vd = VRegister(0, vd_bits); |
| VRegister vn = v1.V16B(); |
| VRegister vntmp = v3.V16B(); |
| |
| // These will have the correct format for use when calling 'helper'. |
| VRegister vd_helper = VRegister(0, vd_bits, vd_lane_count); |
| VRegister vn_helper = VRegister(1, vn_bits, vn_lane_count); |
| |
| // 'v*tmp_single' will be either 'Vt.B', 'Vt.H', 'Vt.S' or 'Vt.D'. |
| VRegister vntmp_single = VRegister(3, vn_lane_bits); |
| |
| __ Mov(out, results); |
| |
| __ Mov(inputs_n_base, inputs_n); |
| __ Mov(inputs_n_last_16bytes, |
| inputs_n + (vn_lane_bytes * inputs_n_length) - 16); |
| |
| __ Ldr(vn, MemOperand(inputs_n_last_16bytes)); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| |
| __ Ldr(vntmp_single, |
| MemOperand(inputs_n_base, index_n, LSL, vn_lane_bytes_log2)); |
| __ Ext(vn, vn, vntmp, vn_lane_bytes); |
| |
| // Set the destination to zero for tests such as '[r]shrn2'. |
| // TODO: Setting the destination to values other than zero might be a better |
| // test for shift and accumulate instructions (srsra/ssra/usra/ursra). |
| __ Movi(vd.V16B(), 0); |
| |
| { |
| for (unsigned i = 0; i < inputs_m_length; i++) { |
| { |
| SingleEmissionCheckScope guard(&masm); |
| (masm.*helper)(vd_helper, vn_helper, inputs_m[i]); |
| } |
| __ Str(vd, MemOperand(out, vd.GetSizeInBytes(), PostIndex)); |
| } |
| } |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_n_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test NEON instructions. The inputs_*[] and expected[] arrays should be |
| // arrays of the rawbit representations of the input values, so that |
| // exact bit comparisons can be performed. |
| template <typename Td, typename Tn, typename Tm> |
| static void Test2OpImmNEON( |
| const char* name, |
| typename Test2OpImmediateNEONHelper_t<Tm>::mnemonic helper, |
| const Tn inputs_n[], |
| unsigned inputs_n_length, |
| const Tm inputs_m[], |
| unsigned inputs_m_length, |
| const Td expected[], |
| unsigned expected_length, |
| VectorFormat vd_form, |
| VectorFormat vn_form) { |
| VIXL_ASSERT(inputs_n_length > 0 && inputs_m_length > 0); |
| |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| |
| const unsigned results_length = inputs_n_length * inputs_m_length; |
| Td* results = new Td[results_length * vd_lane_count]; |
| const unsigned lane_bit = sizeof(Td) * 8; |
| const unsigned lane_len_in_hex = MaxHexCharCount<Td, Tn>(); |
| |
| bool skipped; |
| |
| Test2OpImmNEON_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs_n), |
| inputs_n_length, |
| inputs_m, |
| inputs_m_length, |
| reinterpret_cast<uintptr_t>(results), |
| vd_form, |
| vn_form, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_NEON_%s[] = {\n", lane_bit, name); |
| for (unsigned iteration = 0; iteration < results_length; iteration++) { |
| printf(" "); |
| // Output a separate result for each element of the result vector. |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned index = lane + (iteration * vd_lane_count); |
| printf(" 0x%0*" PRIx64 ",", |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[index])); |
| } |
| printf("\n"); |
| } |
| |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_NEON_%s = %u;\n", |
| name, |
| results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned d = 0; |
| const char* padding = " "; |
| VIXL_ASSERT(strlen(padding) >= (lane_len_in_hex + 1)); |
| for (unsigned n = 0; n < inputs_n_length; n++) { |
| for (unsigned m = 0; m < inputs_m_length; m++, d++) { |
| bool error_in_vector = false; |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = (n * inputs_m_length * vd_lane_count) + |
| (m * vd_lane_count) + lane; |
| |
| if (results[output_index] != expected[output_index]) { |
| error_in_vector = true; |
| break; |
| } |
| } |
| |
| if (error_in_vector && (++error_count <= kErrorReportLimit)) { |
| printf("%s\n", name); |
| printf(" Vn%.*s| Imm%.*s| Vd%.*s| Expected\n", |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex, |
| padding, |
| lane_len_in_hex + 1, |
| padding); |
| |
| const unsigned first_index_n = |
| inputs_n_length - (16 / vn_lane_bytes) + n + 1; |
| |
| for (unsigned lane = 0; lane < std::max(vd_lane_count, vn_lane_count); |
| lane++) { |
| unsigned output_index = (n * inputs_m_length * vd_lane_count) + |
| (m * vd_lane_count) + lane; |
| unsigned input_index_n = (first_index_n + lane) % inputs_n_length; |
| unsigned input_index_m = m; |
| |
| printf("%c0x%0*" PRIx64 " | 0x%0*" PRIx64 |
| " " |
| "| 0x%0*" PRIx64 " | 0x%0*" PRIx64 "\n", |
| results[output_index] != expected[output_index] ? '*' : ' ', |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_n[input_index_n]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_m[input_index_m]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[output_index]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(expected[output_index])); |
| } |
| } |
| } |
| } |
| VIXL_ASSERT(d == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // ==== Tests for instructions of the form <INST> VReg, #Imm, VReg, #Imm. ==== |
| |
| |
| static void TestOpImmOpImmNEON_Helper(TestOpImmOpImmVdUpdateNEONHelper_t helper, |
| uintptr_t inputs_d, |
| const int inputs_imm1[], |
| unsigned inputs_imm1_length, |
| uintptr_t inputs_n, |
| unsigned inputs_n_length, |
| const int inputs_imm2[], |
| unsigned inputs_imm2_length, |
| uintptr_t results, |
| VectorFormat vd_form, |
| VectorFormat vn_form, |
| bool* skipped) { |
| VIXL_ASSERT(vd_form != kFormatUndefined); |
| VIXL_ASSERT(vn_form != kFormatUndefined); |
| |
| SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kFP); |
| START(); |
| |
| // Roll up the loop to keep the code size down. |
| Label loop_n; |
| |
| Register out = x0; |
| Register inputs_d_base = x1; |
| Register inputs_n_base = x2; |
| Register inputs_n_last_vector = x4; |
| Register index_n = x6; |
| |
| // TODO: Refactor duplicate definitions below with a VRegister::As() routine. |
| const unsigned vd_bits = RegisterSizeInBitsFromFormat(vd_form); |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| |
| const unsigned vn_bits = RegisterSizeInBitsFromFormat(vn_form); |
| const unsigned vn_lane_count = LaneCountFromFormat(vn_form); |
| const unsigned vn_lane_bytes = LaneSizeInBytesFromFormat(vn_form); |
| const unsigned vn_lane_bytes_log2 = LaneSizeInBytesLog2FromFormat(vn_form); |
| const unsigned vn_lane_bits = LaneSizeInBitsFromFormat(vn_form); |
| |
| |
| // These will be either a D- or a Q-register form, with a single lane |
| // (for use in scalar load and store operations). |
| VRegister vd = VRegister(0, vd_bits); |
| VRegister vn = VRegister(1, vn_bits); |
| VRegister vntmp = VRegister(4, vn_bits); |
| VRegister vres = VRegister(5, vn_bits); |
| |
| VRegister vn_helper = VRegister(1, vn_bits, vn_lane_count); |
| VRegister vres_helper = VRegister(5, vd_bits, vd_lane_count); |
| |
| // 'v*tmp_single' will be either 'Vt.B', 'Vt.H', 'Vt.S' or 'Vt.D'. |
| VRegister vntmp_single = VRegister(4, vn_lane_bits); |
| |
| // Same registers for use in the 'ext' instructions. |
| VRegister vn_ext = (kDRegSize == vn_bits) ? vn.V8B() : vn.V16B(); |
| VRegister vntmp_ext = (kDRegSize == vn_bits) ? vntmp.V8B() : vntmp.V16B(); |
| |
| __ Mov(out, results); |
| |
| __ Mov(inputs_d_base, inputs_d); |
| |
| __ Mov(inputs_n_base, inputs_n); |
| __ Mov(inputs_n_last_vector, |
| inputs_n + vn_lane_bytes * (inputs_n_length - vn_lane_count)); |
| |
| __ Ldr(vd, MemOperand(inputs_d_base)); |
| |
| __ Ldr(vn, MemOperand(inputs_n_last_vector)); |
| |
| __ Mov(index_n, 0); |
| __ Bind(&loop_n); |
| |
| __ Ldr(vntmp_single, |
| MemOperand(inputs_n_base, index_n, LSL, vn_lane_bytes_log2)); |
| __ Ext(vn_ext, vn_ext, vntmp_ext, vn_lane_bytes); |
| |
| { |
| EmissionCheckScope guard(&masm, |
| kInstructionSize * inputs_imm1_length * |
| inputs_imm2_length * 3); |
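| // The scope budget above allows up to three instructions per |
| // (imm1, imm2) combination: the Mov, the tested instruction and the Str. |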
| for (unsigned i = 0; i < inputs_imm1_length; i++) { |
| for (unsigned j = 0; j < inputs_imm2_length; j++) { |
| __ Mov(vres, vd); |
| (masm.*helper)(vres_helper, inputs_imm1[i], vn_helper, inputs_imm2[j]); |
| __ Str(vres, MemOperand(out, vd.GetSizeInBytes(), PostIndex)); |
| } |
| } |
| } |
| |
| __ Add(index_n, index_n, 1); |
| __ Cmp(index_n, inputs_n_length); |
| __ B(lo, &loop_n); |
| |
| END(); |
| TRY_RUN(skipped); |
| } |
| |
| |
| // Test NEON instructions. The inputs_*[] and expected[] arrays should be |
| // arrays of the rawbit representations of the input values, so that |
| // exact bit comparisons can be performed. |
| template <typename Td, typename Tn> |
| static void TestOpImmOpImmNEON(const char* name, |
| TestOpImmOpImmVdUpdateNEONHelper_t helper, |
| const Td inputs_d[], |
| const int inputs_imm1[], |
| unsigned inputs_imm1_length, |
| const Tn inputs_n[], |
| unsigned inputs_n_length, |
| const int inputs_imm2[], |
| unsigned inputs_imm2_length, |
| const Td expected[], |
| unsigned expected_length, |
| VectorFormat vd_form, |
| VectorFormat vn_form) { |
| VIXL_ASSERT(inputs_n_length > 0); |
| VIXL_ASSERT(inputs_imm1_length > 0); |
| VIXL_ASSERT(inputs_imm2_length > 0); |
| |
| const unsigned vd_lane_count = LaneCountFromFormat(vd_form); |
| |
| const unsigned results_length = |
| inputs_n_length * inputs_imm1_length * inputs_imm2_length; |
| |
| Td* results = new Td[results_length * vd_lane_count]; |
| const unsigned lane_bit = sizeof(Td) * 8; |
| const unsigned lane_len_in_hex = MaxHexCharCount<Td, Tn>(); |
| |
| bool skipped; |
| |
| TestOpImmOpImmNEON_Helper(helper, |
| reinterpret_cast<uintptr_t>(inputs_d), |
| inputs_imm1, |
| inputs_imm1_length, |
| reinterpret_cast<uintptr_t>(inputs_n), |
| inputs_n_length, |
| inputs_imm2, |
| inputs_imm2_length, |
| reinterpret_cast<uintptr_t>(results), |
| vd_form, |
| vn_form, |
| &skipped); |
| |
| if (Test::generate_test_trace()) { |
| // Print the results. |
| printf("const uint%u_t kExpected_NEON_%s[] = {\n", lane_bit, name); |
| for (unsigned iteration = 0; iteration < results_length; iteration++) { |
| printf(" "); |
| // Output a separate result for each element of the result vector. |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned index = lane + (iteration * vd_lane_count); |
| printf(" 0x%0*" PRIx64 ",", |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[index])); |
| } |
| printf("\n"); |
| } |
| |
| printf("};\n"); |
| printf("const unsigned kExpectedCount_NEON_%s = %u;\n", |
| name, |
| results_length); |
| } else if (!skipped) { |
| // Check the results. |
| VIXL_CHECK(expected_length == results_length); |
| unsigned error_count = 0; |
| unsigned counted_length = 0; |
| const char* padding = " "; |
| VIXL_ASSERT(strlen(padding) >= (lane_len_in_hex + 1)); |
| for (unsigned n = 0; n < inputs_n_length; n++) { |
| for (unsigned imm1 = 0; imm1 < inputs_imm1_length; imm1++) { |
| for (unsigned imm2 = 0; imm2 < inputs_imm2_length; imm2++) { |
| bool error_in_vector = false; |
| |
| counted_length++; |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = |
| (n * inputs_imm1_length * inputs_imm2_length * vd_lane_count) + |
| (imm1 * inputs_imm2_length * vd_lane_count) + |
| (imm2 * vd_lane_count) + lane; |
| |
| if (results[output_index] != expected[output_index]) { |
| error_in_vector = true; |
| break; |
| } |
| } |
| |
| if (error_in_vector && (++error_count <= kErrorReportLimit)) { |
| printf("%s\n", name); |
| printf(" Vd%.*s| Imm%.*s| Vn%.*s| Imm%.*s| Vd%.*s| Expected\n", |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex, |
| padding, |
| lane_len_in_hex + 1, |
| padding, |
| lane_len_in_hex, |
| padding, |
| lane_len_in_hex + 1, |
| padding); |
| |
| for (unsigned lane = 0; lane < vd_lane_count; lane++) { |
| unsigned output_index = |
| (n * inputs_imm1_length * inputs_imm2_length * |
| vd_lane_count) + |
| (imm1 * inputs_imm2_length * vd_lane_count) + |
| (imm2 * vd_lane_count) + lane; |
| unsigned input_index_n = |
| (inputs_n_length - vd_lane_count + n + 1 + lane) % |
| inputs_n_length; |
| unsigned input_index_imm1 = imm1; |
| unsigned input_index_imm2 = imm2; |
| |
| printf("%c0x%0*" PRIx64 " | 0x%0*" PRIx64 " | 0x%0*" PRIx64 |
| " " |
| "| 0x%0*" PRIx64 " | 0x%0*" PRIx64 " | 0x%0*" PRIx64 "\n", |
| results[output_index] != expected[output_index] ? '*' |
| : ' ', |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_d[lane]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_imm1[input_index_imm1]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_n[input_index_n]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(inputs_imm2[input_index_imm2]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(results[output_index]), |
| lane_len_in_hex, |
| static_cast<uint64_t>(expected[output_index])); |
| } |
| } |
| } |
| } |
| } |
| VIXL_CHECK(counted_length == expected_length); |
| if (error_count > kErrorReportLimit) { |
| printf("%u other errors follow.\n", error_count - kErrorReportLimit); |
| } |
| VIXL_CHECK(error_count == 0); |
| } |
| delete[] results; |
| } |
| |
| |
| // ==== Floating-point tests. ==== |
| |
| |
| // Standard floating-point test expansion for both double- and single-precision |
| // operations. |
| #define STRINGIFY(s) #s |
| |
| #define CALL_TEST_FP_HELPER(mnemonic, variant, type, input) \ |
| Test##type(STRINGIFY(mnemonic) "_" STRINGIFY(variant), \ |
| &MacroAssembler::mnemonic, \ |
| input, \ |
| sizeof(input) / sizeof(input[0]), \ |
| kExpected_##mnemonic##_##variant, \ |
| kExpectedCount_##mnemonic##_##variant) |
| |
| #define DEFINE_TEST_FP(mnemonic, type, input) \ |
| TEST(mnemonic##_d) { \ |
| CALL_TEST_FP_HELPER(mnemonic, d, type, kInputDouble##input); \ |
| } \ |
| TEST(mnemonic##_s) { \ |
| CALL_TEST_FP_HELPER(mnemonic, s, type, kInputFloat##input); \ |
| } |
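| |
| // For example, DEFINE_TEST_FP(frint32x, 1Op, Conversions) (used below) |
| // expands to TEST(frint32x_d) and TEST(frint32x_s); the former calls: |
| //   Test1Op("frint32x_d", |
| //           &MacroAssembler::frint32x, |
| //           kInputDoubleConversions, |
| //           sizeof(kInputDoubleConversions) / |
| //               sizeof(kInputDoubleConversions[0]), |
| //           kExpected_frint32x_d, |
| //           kExpectedCount_frint32x_d); |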
| |
| #define DEFINE_TEST_FP_FP16(mnemonic, type, input) \ |
| TEST(mnemonic##_d) { \ |
| CALL_TEST_FP_HELPER(mnemonic, d, type, kInputDouble##input); \ |
| } \ |
| TEST(mnemonic##_s) { \ |
| CALL_TEST_FP_HELPER(mnemonic, s, type, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_h) { \ |
| CALL_TEST_FP_HELPER(mnemonic, h, type, kInputFloat16##input); \ |
| } |
| |
| |
| // TODO: Test with a newer version of valgrind. |
| // |
| // Note: valgrind-3.10.0 does not properly interpret libm's fma() on x86_64. |
| // Therefore this test will exit through an ASSERT and thus leak memory. |
| DEFINE_TEST_FP_FP16(fmadd, 3Op, Basic) |
| DEFINE_TEST_FP_FP16(fmsub, 3Op, Basic) |
| DEFINE_TEST_FP_FP16(fnmadd, 3Op, Basic) |
| DEFINE_TEST_FP_FP16(fnmsub, 3Op, Basic) |
| |
| DEFINE_TEST_FP_FP16(fadd, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fdiv, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fmax, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fmaxnm, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fmin, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fminnm, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fmul, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fsub, 2Op, Basic) |
| DEFINE_TEST_FP_FP16(fnmul, 2Op, Basic) |
| |
| DEFINE_TEST_FP_FP16(fabs, 1Op, Basic) |
| DEFINE_TEST_FP_FP16(fmov, 1Op, Basic) |
| DEFINE_TEST_FP_FP16(fneg, 1Op, Basic) |
| DEFINE_TEST_FP_FP16(fsqrt, 1Op, Basic) |
| DEFINE_TEST_FP(frint32x, 1Op, Conversions) |
| DEFINE_TEST_FP(frint64x, 1Op, Conversions) |
| DEFINE_TEST_FP(frint32z, 1Op, Conversions) |
| DEFINE_TEST_FP(frint64z, 1Op, Conversions) |
| DEFINE_TEST_FP_FP16(frinta, 1Op, Conversions) |
| DEFINE_TEST_FP_FP16(frinti, 1Op, Conversions) |
| DEFINE_TEST_FP_FP16(frintm, 1Op, Conversions) |
| DEFINE_TEST_FP_FP16(frintn, 1Op, Conversions) |
| DEFINE_TEST_FP_FP16(frintp, 1Op, Conversions) |
| DEFINE_TEST_FP_FP16(frintx, 1Op, Conversions) |
| DEFINE_TEST_FP_FP16(frintz, 1Op, Conversions) |
| |
| TEST(fcmp_d) { CALL_TEST_FP_HELPER(fcmp, d, Cmp, kInputDoubleBasic); } |
| TEST(fcmp_s) { CALL_TEST_FP_HELPER(fcmp, s, Cmp, kInputFloatBasic); } |
| TEST(fcmp_dz) { CALL_TEST_FP_HELPER(fcmp, dz, CmpZero, kInputDoubleBasic); } |
| TEST(fcmp_sz) { CALL_TEST_FP_HELPER(fcmp, sz, CmpZero, kInputFloatBasic); } |
| |
| TEST(fcvt_sd) { CALL_TEST_FP_HELPER(fcvt, sd, 1Op, kInputDoubleConversions); } |
| TEST(fcvt_ds) { CALL_TEST_FP_HELPER(fcvt, ds, 1Op, kInputFloatConversions); } |
| |
| #define DEFINE_TEST_FP_TO_INT(mnemonic, type, input) \ |
| TEST(mnemonic##_xd) { \ |
| CALL_TEST_FP_HELPER(mnemonic, xd, type, kInputDouble##input); \ |
| } \ |
| TEST(mnemonic##_xs) { \ |
| CALL_TEST_FP_HELPER(mnemonic, xs, type, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_xh) { \ |
| CALL_TEST_FP_HELPER(mnemonic, xh, type, kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_wd) { \ |
| CALL_TEST_FP_HELPER(mnemonic, wd, type, kInputDouble##input); \ |
| } \ |
| TEST(mnemonic##_ws) { \ |
| CALL_TEST_FP_HELPER(mnemonic, ws, type, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_wh) { \ |
| CALL_TEST_FP_HELPER(mnemonic, wh, type, kInputFloat16##input); \ |
| } |
| |
| DEFINE_TEST_FP_TO_INT(fcvtas, FPToS, Conversions) |
| DEFINE_TEST_FP_TO_INT(fcvtau, FPToU, Conversions) |
| DEFINE_TEST_FP_TO_INT(fcvtms, FPToS, Conversions) |
| DEFINE_TEST_FP_TO_INT(fcvtmu, FPToU, Conversions) |
| DEFINE_TEST_FP_TO_INT(fcvtns, FPToS, Conversions) |
| DEFINE_TEST_FP_TO_INT(fcvtnu, FPToU, Conversions) |
| DEFINE_TEST_FP_TO_INT(fcvtzs, FPToFixedS, Conversions) |
| DEFINE_TEST_FP_TO_INT(fcvtzu, FPToFixedU, Conversions) |
| |
| #define DEFINE_TEST_FP_TO_JS_INT(mnemonic, type, input) \ |
| TEST(mnemonic##_wd) { \ |
| CALL_TEST_FP_HELPER(mnemonic, wd, type, kInputDouble##input); \ |
| } |
| |
| DEFINE_TEST_FP_TO_JS_INT(fjcvtzs, FPToS, Conversions) |
| |
| // TODO: Scvtf-fixed-point |
| // TODO: Scvtf-integer |
| // TODO: Ucvtf-fixed-point |
| // TODO: Ucvtf-integer |
| |
| // TODO: Fccmp |
| // TODO: Fcsel |
| |
| |
| // ==== NEON Tests. ==== |
| |
| #define CALL_TEST_NEON_HELPER_1Op(mnemonic, vdform, vnform, input_n) \ |
| Test1OpNEON(STRINGIFY(mnemonic) "_" STRINGIFY(vdform), \ |
| &MacroAssembler::mnemonic, \ |
| input_n, \ |
| (sizeof(input_n) / sizeof(input_n[0])), \ |
| kExpected_NEON_##mnemonic##_##vdform, \ |
| kExpectedCount_NEON_##mnemonic##_##vdform, \ |
| kFormat##vdform, \ |
| kFormat##vnform) |
| |
| #define CALL_TEST_NEON_HELPER_1OpAcross(mnemonic, vdform, vnform, input_n) \ |
| Test1OpAcrossNEON(STRINGIFY(mnemonic) "_" STRINGIFY(vdform) "_" STRINGIFY( \ |
| vnform), \ |
| &MacroAssembler::mnemonic, \ |
| input_n, \ |
| (sizeof(input_n) / sizeof(input_n[0])), \ |
| kExpected_NEON_##mnemonic##_##vdform##_##vnform, \ |
| kExpectedCount_NEON_##mnemonic##_##vdform##_##vnform, \ |
| kFormat##vdform, \ |
| kFormat##vnform) |
| |
| #define CALL_TEST_NEON_HELPER_2Op(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m) \ |
| Test2OpNEON(STRINGIFY(mnemonic) "_" STRINGIFY(vdform), \ |
| &MacroAssembler::mnemonic, \ |
| input_d, \ |
| input_n, \ |
| (sizeof(input_n) / sizeof(input_n[0])), \ |
| input_m, \ |
| (sizeof(input_m) / sizeof(input_m[0])), \ |
| kExpected_NEON_##mnemonic##_##vdform, \ |
| kExpectedCount_NEON_##mnemonic##_##vdform, \ |
| kFormat##vdform, \ |
| kFormat##vnform, \ |
| kFormat##vmform) |
| |
| #define CALL_TEST_NEON_HELPER_2OpImm(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| input_n, \ |
| input_m) \ |
| Test2OpImmNEON(STRINGIFY(mnemonic) "_" STRINGIFY(vdform) "_2OPIMM", \ |
| &MacroAssembler::mnemonic, \ |
| input_n, \ |
| (sizeof(input_n) / sizeof(input_n[0])), \ |
| input_m, \ |
| (sizeof(input_m) / sizeof(input_m[0])), \ |
| kExpected_NEON_##mnemonic##_##vdform##_2OPIMM, \ |
| kExpectedCount_NEON_##mnemonic##_##vdform##_2OPIMM, \ |
| kFormat##vdform, \ |
| kFormat##vnform) |
| |
| #define CALL_TEST_NEON_HELPER_ByElement(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m, \ |
| indices) \ |
| TestByElementNEON( \ |
| STRINGIFY(mnemonic) "_" STRINGIFY(vdform) "_" STRINGIFY( \ |
| vnform) "_" STRINGIFY(vmform), \ |
| &MacroAssembler::mnemonic, \ |
| input_d, \ |
| input_n, \ |
| (sizeof(input_n) / sizeof(input_n[0])), \ |
| input_m, \ |
| (sizeof(input_m) / sizeof(input_m[0])), \ |
| indices, \ |
| (sizeof(indices) / sizeof(indices[0])), \ |
| kExpected_NEON_##mnemonic##_##vdform##_##vnform##_##vmform, \ |
| kExpectedCount_NEON_##mnemonic##_##vdform##_##vnform##_##vmform, \ |
| kFormat##vdform, \ |
| kFormat##vnform, \ |
| kFormat##vmform) |
| |
| #define CALL_TEST_NEON_HELPER_ByElement_Dot_Product(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m, \ |
| indices, \ |
| vm_subvector_count) \ |
| TestByElementNEON( \ |
| STRINGIFY(mnemonic) "_" STRINGIFY(vdform) "_" STRINGIFY( \ |
| vnform) "_" STRINGIFY(vmform), \ |
| &MacroAssembler::mnemonic, \ |
| input_d, \ |
| input_n, \ |
| (sizeof(input_n) / sizeof(input_n[0])), \ |
| input_m, \ |
| (sizeof(input_m) / sizeof(input_m[0])), \ |
| indices, \ |
| (sizeof(indices) / sizeof(indices[0])), \ |
| kExpected_NEON_##mnemonic##_##vdform##_##vnform##_##vmform, \ |
| kExpectedCount_NEON_##mnemonic##_##vdform##_##vnform##_##vmform, \ |
| kFormat##vdform, \ |
| kFormat##vnform, \ |
| kFormat##vmform, \ |
| vm_subvector_count) |
| |
| #define CALL_TEST_NEON_HELPER_OpImmOpImm(helper, \ |
| mnemonic, \ |
| vdform, \ |
| vnform, \ |
| input_d, \ |
| input_imm1, \ |
| input_n, \ |
| input_imm2) \ |
| TestOpImmOpImmNEON(STRINGIFY(mnemonic) "_" STRINGIFY(vdform), \ |
| helper, \ |
| input_d, \ |
| input_imm1, \ |
| (sizeof(input_imm1) / sizeof(input_imm1[0])), \ |
| input_n, \ |
| (sizeof(input_n) / sizeof(input_n[0])), \ |
| input_imm2, \ |
| (sizeof(input_imm2) / sizeof(input_imm2[0])), \ |
| kExpected_NEON_##mnemonic##_##vdform, \ |
| kExpectedCount_NEON_##mnemonic##_##vdform, \ |
| kFormat##vdform, \ |
| kFormat##vnform) |
| |
| #define CALL_TEST_NEON_HELPER_2SAME(mnemonic, variant, input) \ |
| CALL_TEST_NEON_HELPER_1Op(mnemonic, variant, variant, input) |
| |
| #define DEFINE_TEST_NEON_2SAME_8B_16B(mnemonic, input) \ |
| TEST(mnemonic##_8B) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 8B, kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_16B) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 16B, kInput8bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_4H_8H(mnemonic, input) \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 4H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_8H) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 8H, kInput16bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_2S_4S(mnemonic, input) \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 2S, kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 4S, kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_BH(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_8B_16B(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_4H_8H(mnemonic, input) |
| |
| #define DEFINE_TEST_NEON_2SAME_NO2D(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_BH(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_2S_4S(mnemonic, input) |
| |
| #define DEFINE_TEST_NEON_2SAME(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_NO2D(mnemonic, input) \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 2D, kInput64bits##input); \ |
| } |
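| |
| // For example, a hypothetical DEFINE_TEST_NEON_2SAME(abs, Basic) would |
| // define TEST(abs_8B) through TEST(abs_2D), each calling Test1OpNEON |
| // with the kInput*Basic list of the matching lane size and the |
| // kExpected_NEON_abs_<form> / kExpectedCount_NEON_abs_<form> traces. |
| |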
| #define DEFINE_TEST_NEON_2SAME_SD(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_2S_4S(mnemonic, input) \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 2D, kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_FP(mnemonic, input) \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 2S, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 4S, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 2D, kInputDouble##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_FP_FP16(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_FP(mnemonic, input) \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 4H, kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_8H) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, 8H, kInputFloat16##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(mnemonic, input) \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, H, kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, S, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, D, kInputDouble##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_SCALAR_B(mnemonic, input) \ |
| TEST(mnemonic##_B) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, B, kInput8bits##input); \ |
| } |
| #define DEFINE_TEST_NEON_2SAME_SCALAR_H(mnemonic, input) \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, H, kInput16bits##input); \ |
| } |
| #define DEFINE_TEST_NEON_2SAME_SCALAR_S(mnemonic, input) \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, S, kInput32bits##input); \ |
| } |
| #define DEFINE_TEST_NEON_2SAME_SCALAR_D(mnemonic, input) \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_2SAME(mnemonic, D, kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2SAME_SCALAR(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_SCALAR_B(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_SCALAR_H(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_SCALAR_S(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_SCALAR_D(mnemonic, input) |
| |
| #define DEFINE_TEST_NEON_2SAME_SCALAR_SD(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_SCALAR_S(mnemonic, input) \ |
| DEFINE_TEST_NEON_2SAME_SCALAR_D(mnemonic, input) |
| |
| |
| #define CALL_TEST_NEON_HELPER_ACROSS(mnemonic, vd_form, vn_form, input_n) \ |
| CALL_TEST_NEON_HELPER_1OpAcross(mnemonic, vd_form, vn_form, input_n) |
| |
| #define DEFINE_TEST_NEON_ACROSS(mnemonic, input) \ |
| TEST(mnemonic##_B_8B) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, B, 8B, kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_B_16B) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, B, 16B, kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_H_4H) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, H, 4H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_H_8H) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, H, 8H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_S_4S) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, S, 4S, kInput32bits##input); \ |
| } |
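| |
| // For example, a hypothetical DEFINE_TEST_NEON_ACROSS(addv, Basic) would |
| // define TEST(addv_B_8B) through TEST(addv_S_4S), each calling |
| // Test1OpAcrossNEON with traces named after both forms, e.g. |
| // kExpected_NEON_addv_S_4S. |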
| |
| #define DEFINE_TEST_NEON_ACROSS_LONG(mnemonic, input) \ |
| TEST(mnemonic##_H_8B) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, H, 8B, kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_H_16B) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, H, 16B, kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_S_4H) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, S, 4H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_S_8H) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, S, 8H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_D_4S) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, D, 4S, kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_ACROSS_FP(mnemonic, input) \ |
| TEST(mnemonic##_H_4H) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, H, 4H, kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_H_8H) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, H, 8H, kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_S_4S) { \ |
| CALL_TEST_NEON_HELPER_ACROSS(mnemonic, S, 4S, kInputFloat##input); \ |
| } |
| |
| #define CALL_TEST_NEON_HELPER_2DIFF(mnemonic, vdform, vnform, input_n) \ |
| CALL_TEST_NEON_HELPER_1Op(mnemonic, vdform, vnform, input_n) |
| |
| #define DEFINE_TEST_NEON_2DIFF_LONG(mnemonic, input) \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 4H, 8B, kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_8H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 8H, 16B, kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 2S, 4H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 4S, 8H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_1D) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 1D, 2S, kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 2D, 4S, kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2DIFF_NARROW(mnemonic, input) \ |
| TEST(mnemonic##_8B) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 8B, 8H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 4H, 4S, kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 2S, 2D, kInput64bits##input); \ |
| } \ |
| TEST(mnemonic##2_16B) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 16B, 8H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##2_8H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 8H, 4S, kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##2_4S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 4S, 2D, kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2DIFF_FP_LONG(mnemonic, input) \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 4S, 4H, kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 2D, 2S, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##2_4S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 4S, 8H, kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##2_2D) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 2D, 4S, kInputFloat##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2DIFF_FP_NARROW(mnemonic, input) \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 4H, 4S, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 2S, 2D, kInputDouble##input); \ |
| } \ |
| TEST(mnemonic##2_8H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 8H, 4S, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##2_4S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 4S, 2D, kInputDouble##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2DIFF_FP_NARROW_2S(mnemonic, input) \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, 2S, 2D, kInputDouble##input); \ |
| } \ |
| TEST(mnemonic##2_4S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic##2, 4S, 2D, kInputDouble##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2DIFF_SCALAR_NARROW(mnemonic, input) \ |
| TEST(mnemonic##_B) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, B, H, kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, H, S, kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, S, D, kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2DIFF_FP_SCALAR_SD(mnemonic, input) \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, S, 2S, kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, D, 2D, kInputDouble##input); \ |
| } \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_2DIFF(mnemonic, H, 2H, kInputFloat16##input); \ |
| } |
| |
| #define CALL_TEST_NEON_HELPER_3SAME(mnemonic, variant, input_d, input_nm) \ |
| { \ |
| CALL_TEST_NEON_HELPER_2Op(mnemonic, \ |
| variant, \ |
| variant, \ |
| variant, \ |
| input_d, \ |
| input_nm, \ |
| input_nm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_8B_16B(mnemonic, input) \ |
| TEST(mnemonic##_8B) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 8B, \ |
| kInput8bitsAccDestination, \ |
| kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_16B) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 16B, \ |
| kInput8bitsAccDestination, \ |
| kInput8bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_HS(mnemonic, input) \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 4H, \ |
| kInput16bitsAccDestination, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_8H) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 8H, \ |
| kInput16bitsAccDestination, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 2S, \ |
| kInput32bitsAccDestination, \ |
| kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 4S, \ |
| kInput32bitsAccDestination, \ |
| kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_NO2D(mnemonic, input) \ |
| DEFINE_TEST_NEON_3SAME_8B_16B(mnemonic, input) \ |
| DEFINE_TEST_NEON_3SAME_HS(mnemonic, input) |
| |
| #define DEFINE_TEST_NEON_3SAME(mnemonic, input) \ |
| DEFINE_TEST_NEON_3SAME_NO2D(mnemonic, input) \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 2D, \ |
| kInput64bitsAccDestination, \ |
| kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_FP(mnemonic, input) \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 4H, \ |
| kInputFloat16AccDestination, \ |
| kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_8H) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 8H, \ |
| kInputFloat16AccDestination, \ |
| kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 2S, \ |
| kInputFloatAccDestination, \ |
| kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 4S, \ |
| kInputFloatAccDestination, \ |
| kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| 2D, \ |
| kInputDoubleAccDestination, \ |
| kInputDouble##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_SCALAR_D(mnemonic, input) \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| D, \ |
| kInput64bitsAccDestination, \ |
| kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_SCALAR_HS(mnemonic, input) \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| H, \ |
| kInput16bitsAccDestination, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| S, \ |
| kInput32bitsAccDestination, \ |
| kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_SCALAR(mnemonic, input) \ |
| TEST(mnemonic##_B) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| B, \ |
| kInput8bitsAccDestination, \ |
| kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| H, \ |
| kInput16bitsAccDestination, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| S, \ |
| kInput32bitsAccDestination, \ |
| kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| D, \ |
| kInput64bitsAccDestination, \ |
| kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3SAME_FP_SCALAR(mnemonic, input) \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| H, \ |
| kInputFloat16AccDestination, \ |
| kInputFloat16##input); \ |
| } \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| S, \ |
| kInputFloatAccDestination, \ |
| kInputFloat##input); \ |
| } \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_3SAME(mnemonic, \ |
| D, \ |
| kInputDoubleAccDestination, \ |
| kInputDouble##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_FHM(mnemonic, input_d, input_n, input_m) \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 2S, \ |
| 2H, \ |
| 2H, \ |
| kInputFloatAccDestination, \ |
| kInputFloat16##input_n, \ |
| kInputFloat16##input_m); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 4S, \ |
| 4H, \ |
| 4H, \ |
| kInputFloatAccDestination, \ |
| kInputFloat16##input_n, \ |
| kInputFloat16##input_m); \ |
| } |
| |
| #define CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m) \ |
| { \ |
| CALL_TEST_NEON_HELPER_2Op(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_LONG_8H(mnemonic, input) \ |
| TEST(mnemonic##_8H) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 8H, \ |
| 8B, \ |
| 8B, \ |
| kInput16bitsAccDestination, \ |
| kInput8bits##input, \ |
| kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##2_8H) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 8H, \ |
| 16B, \ |
| 16B, \ |
| kInput16bitsAccDestination, \ |
| kInput8bits##input, \ |
| kInput8bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_LONG_4S(mnemonic, input) \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 4S, \ |
| 4H, \ |
| 4H, \ |
| kInput32bitsAccDestination, \ |
| kInput16bits##input, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##2_4S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 4S, \ |
| 8H, \ |
| 8H, \ |
| kInput32bitsAccDestination, \ |
| kInput16bits##input, \ |
| kInput16bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_LONG_2D(mnemonic, input) \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 2D, \ |
| 2S, \ |
| 2S, \ |
| kInput64bitsAccDestination, \ |
| kInput32bits##input, \ |
| kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##2_2D) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 2D, \ |
| 4S, \ |
| 4S, \ |
| kInput64bitsAccDestination, \ |
| kInput32bits##input, \ |
| kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_LONG_SD(mnemonic, input) \ |
| DEFINE_TEST_NEON_3DIFF_LONG_4S(mnemonic, input) \ |
| DEFINE_TEST_NEON_3DIFF_LONG_2D(mnemonic, input) |
| |
| #define DEFINE_TEST_NEON_3DIFF_LONG(mnemonic, input) \ |
| DEFINE_TEST_NEON_3DIFF_LONG_8H(mnemonic, input) \ |
| DEFINE_TEST_NEON_3DIFF_LONG_4S(mnemonic, input) \ |
| DEFINE_TEST_NEON_3DIFF_LONG_2D(mnemonic, input) |
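// For example, DEFINE_TEST_NEON_3DIFF_LONG(saddl, Basic) covers the three
// long forms (8H <- 8B, 4S <- 4H, 2D <- 2S) plus the saddl2 variants, which
// read the upper halves of the source registers.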
| |
| #define DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_S(mnemonic, input) \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| S, \ |
| H, \ |
| H, \ |
| kInput32bitsAccDestination, \ |
| kInput16bits##input, \ |
| kInput16bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_D(mnemonic, input) \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| D, \ |
| S, \ |
| S, \ |
| kInput64bitsAccDestination, \ |
| kInput32bits##input, \ |
| kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_SD(mnemonic, input) \ |
| DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_S(mnemonic, input) \ |
| DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_D(mnemonic, input) |
| |
| #define DEFINE_TEST_NEON_3DIFF_WIDE(mnemonic, input) \ |
| TEST(mnemonic##_8H) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 8H, \ |
| 8H, \ |
| 8B, \ |
| kInput16bitsAccDestination, \ |
| kInput16bits##input, \ |
| kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 4S, \ |
| 4S, \ |
| 4H, \ |
| kInput32bitsAccDestination, \ |
| kInput32bits##input, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_2D) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 2D, \ |
| 2D, \ |
| 2S, \ |
| kInput64bitsAccDestination, \ |
| kInput64bits##input, \ |
| kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##2_8H) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 8H, \ |
| 8H, \ |
| 16B, \ |
| kInput16bitsAccDestination, \ |
| kInput16bits##input, \ |
| kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##2_4S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 4S, \ |
| 4S, \ |
| 8H, \ |
| kInput32bitsAccDestination, \ |
| kInput32bits##input, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##2_2D) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 2D, \ |
| 2D, \ |
| 4S, \ |
| kInput64bitsAccDestination, \ |
| kInput64bits##input, \ |
| kInput32bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_NARROW(mnemonic, input) \ |
| TEST(mnemonic##_8B) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 8B, \ |
| 8H, \ |
| 8H, \ |
| kInput8bitsAccDestination, \ |
| kInput16bits##input, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##_4H) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 4H, \ |
| 4S, \ |
| 4S, \ |
| kInput16bitsAccDestination, \ |
| kInput32bits##input, \ |
| kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 2S, \ |
| 2D, \ |
| 2D, \ |
| kInput32bitsAccDestination, \ |
| kInput64bits##input, \ |
| kInput64bits##input); \ |
| } \ |
| TEST(mnemonic##2_16B) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 16B, \ |
| 8H, \ |
| 8H, \ |
| kInput8bitsAccDestination, \ |
| kInput16bits##input, \ |
| kInput16bits##input); \ |
| } \ |
| TEST(mnemonic##2_8H) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 8H, \ |
| 4S, \ |
| 4S, \ |
| kInput16bitsAccDestination, \ |
| kInput32bits##input, \ |
| kInput32bits##input); \ |
| } \ |
| TEST(mnemonic##2_4S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic##2, \ |
| 4S, \ |
| 2D, \ |
| 2D, \ |
| kInput32bitsAccDestination, \ |
| kInput64bits##input, \ |
| kInput64bits##input); \ |
| } |
| |
| #define DEFINE_TEST_NEON_3DIFF_DOUBLE_WIDE(mnemonic, input) \ |
| TEST(mnemonic##_2S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 2S, \ |
| 8B, \ |
| 8B, \ |
| kInput32bitsAccDestination, \ |
| kInput8bits##input, \ |
| kInput8bits##input); \ |
| } \ |
| TEST(mnemonic##_4S) { \ |
| CALL_TEST_NEON_HELPER_3DIFF(mnemonic, \ |
| 4S, \ |
| 16B, \ |
| 16B, \ |
| kInput32bitsAccDestination, \ |
| kInput8bits##input, \ |
| kInput8bits##input); \ |
| } |
| |
| |
| #define CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| input_n, \ |
| input_imm) \ |
| { \ |
| CALL_TEST_NEON_HELPER_2OpImm(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| input_n, \ |
| input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_8B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8B, \ |
| 8B, \ |
| kInput8bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_16B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 16B, \ |
| 16B, \ |
| kInput8bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4H, \ |
| 4H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8H, \ |
| 8H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2S, \ |
| 2S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4S, \ |
| 4S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2D, \ |
| 2D, \ |
| kInput64bits##input, \ |
| kInput64bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_COPY(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_8B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8B, \ |
| B, \ |
| kInput8bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_16B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 16B, \ |
| B, \ |
| kInput8bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4H, \ |
| H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8H, \ |
| H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2S, \ |
| S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4S, \ |
| S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2D, \ |
| D, \ |
| kInput64bits##input, \ |
| kInput64bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_NARROW(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_8B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8B, \ |
| 8H, \ |
| kInput16bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4H, \ |
| 4S, \ |
| kInput32bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2S, \ |
| 2D, \ |
| kInput64bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##2_16B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic##2, \ |
| 16B, \ |
| 8H, \ |
| kInput16bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##2_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic##2, \ |
| 8H, \ |
| 4S, \ |
| kInput32bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##2_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic##2, \ |
| 4S, \ |
| 2D, \ |
| kInput64bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_SCALAR_NARROW(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| B, \ |
| H, \ |
| kInput16bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| H, \ |
| S, \ |
| kInput32bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| S, \ |
| D, \ |
| kInput64bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_FCMP_ZERO(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_4H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4H, \ |
| 4H, \ |
| kInputFloat16##input, \ |
| kInputDoubleImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8H, \ |
| 8H, \ |
| kInputFloat16##input, \ |
| kInputDoubleImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2S, \ |
| 2S, \ |
kInputFloat##input, \
| kInputDoubleImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4S, \ |
| 4S, \ |
| kInputFloat##input, \ |
| kInputDoubleImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2D, \ |
| 2D, \ |
| kInputDouble##input, \ |
| kInputDoubleImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_FP(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_4H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4H, \ |
| 4H, \ |
| kInputFloat16##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8H, \ |
| 8H, \ |
| kInputFloat16##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2S, \ |
| 2S, \ |
kInputFloat##input, \
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4S, \ |
| 4S, \ |
| kInputFloat##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2D, \ |
| 2D, \ |
| kInputDouble##input, \ |
| kInput64bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_FP_SCALAR(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| H, \ |
| H, \ |
kInputFloat16##input, \
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| S, \ |
| S, \ |
kInputFloat##input, \
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| D, \ |
| D, \ |
| kInputDouble##input, \ |
| kInput64bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_HSD(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_4H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4H, \ |
| 4H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8H, \ |
| 8H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2S, \ |
| 2S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4S, \ |
| 4S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2D, \ |
| 2D, \ |
| kInput64bits##input, \ |
| kInput64bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_SCALAR_D(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| D, \ |
| D, \ |
| kInput64bits##input, \ |
| kInput64bitsImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_SCALAR_HSD(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| H, \ |
| H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| S, \ |
| S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(mnemonic, input, input_imm) |
| |
| #define DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_D(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| D, \ |
| D, \ |
| kInputDouble##input, \ |
| kInputDoubleImm##input_imm); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_HSD(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| H, \ |
| H, \ |
| kInputFloat16##input, \ |
| kInputDoubleImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| S, \ |
| S, \ |
| kInputFloat##input, \ |
| kInputDoubleImm##input_imm); \ |
| } \ |
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_D(mnemonic, input, input_imm) |
| |
| #define DEFINE_TEST_NEON_2OPIMM_SCALAR(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_B_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| B, \ |
| B, \ |
| kInput8bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_HSD(mnemonic, input, input_imm) |
| |
| #define DEFINE_TEST_NEON_2OPIMM_LONG(mnemonic, input, input_imm) \ |
| TEST(mnemonic##_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 8H, \ |
| 8B, \ |
| kInput8bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 4S, \ |
| 4H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##_2D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic, \ |
| 2D, \ |
| 2S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##2_8H_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic##2, \ |
| 8H, \ |
| 16B, \ |
| kInput8bits##input, \ |
| kInput8bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##2_4S_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic##2, \ |
| 4S, \ |
| 8H, \ |
| kInput16bits##input, \ |
| kInput16bitsImm##input_imm); \ |
| } \ |
| TEST(mnemonic##2_2D_2OPIMM) { \ |
| CALL_TEST_NEON_HELPER_2OPIMM(mnemonic##2, \ |
| 2D, \ |
| 4S, \ |
| kInput32bits##input, \ |
| kInput32bitsImm##input_imm); \ |
| } |
| |
| #define CALL_TEST_NEON_HELPER_BYELEMENT_DOT_PRODUCT(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m, \ |
| indices, \ |
| vm_subvector_count) \ |
| { \ |
| CALL_TEST_NEON_HELPER_ByElement_Dot_Product(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m, \ |
| indices, \ |
| vm_subvector_count); \ |
| } |
| |
| #define DEFINE_TEST_NEON_BYELEMENT_DOT_PRODUCT(mnemonic, \ |
| input_d, \ |
| input_n, \ |
| input_m) \ |
| TEST(mnemonic##_2S_8B_B) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT_DOT_PRODUCT(mnemonic, \ |
| 2S, \ |
| 8B, \ |
| B, \ |
| kInput32bits##input_d, \ |
| kInput8bits##input_n, \ |
| kInput8bits##input_m, \ |
| kInputSIndices, \ |
| 4); \ |
| } \ |
| TEST(mnemonic##_4S_16B_B) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT_DOT_PRODUCT(mnemonic, \ |
| 4S, \ |
| 16B, \ |
| B, \ |
| kInput32bits##input_d, \ |
| kInput8bits##input_n, \ |
| kInput8bits##input_m, \ |
| kInputSIndices, \ |
| 4); \ |
| } |
| |
| #define CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m, \ |
| indices) \ |
| { \ |
| CALL_TEST_NEON_HELPER_ByElement(mnemonic, \ |
| vdform, \ |
| vnform, \ |
| vmform, \ |
| input_d, \ |
| input_n, \ |
| input_m, \ |
| indices); \ |
| } |
| |
| #define DEFINE_TEST_NEON_BYELEMENT(mnemonic, input_d, input_n, input_m) \ |
| TEST(mnemonic##_4H_4H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 4H, \ |
| 4H, \ |
| H, \ |
| kInput16bits##input_d, \ |
| kInput16bits##input_n, \ |
| kInput16bits##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_8H_8H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 8H, \ |
| 8H, \ |
| H, \ |
| kInput16bits##input_d, \ |
| kInput16bits##input_n, \ |
| kInput16bits##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_2S_2S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 2S, \ |
| 2S, \ |
| S, \ |
| kInput32bits##input_d, \ |
| kInput32bits##input_n, \ |
| kInput32bits##input_m, \ |
| kInputSIndices); \ |
| } \ |
| TEST(mnemonic##_4S_4S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 4S, \ |
| 4S, \ |
| S, \ |
| kInput32bits##input_d, \ |
| kInput32bits##input_n, \ |
| kInput32bits##input_m, \ |
| kInputSIndices); \ |
| } |
| |
| #define DEFINE_TEST_NEON_BYELEMENT_SCALAR(mnemonic, input_d, input_n, input_m) \ |
| TEST(mnemonic##_H_H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| H, \ |
| H, \ |
| H, \ |
| kInput16bits##input_d, \ |
| kInput16bits##input_n, \ |
| kInput16bits##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_S_S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| S, \ |
| S, \ |
| S, \ |
| kInput32bits##input_d, \ |
| kInput32bits##input_n, \ |
| kInput32bits##input_m, \ |
| kInputSIndices); \ |
| } |
| |
| #define DEFINE_TEST_NEON_FP_BYELEMENT(mnemonic, input_d, input_n, input_m) \ |
| TEST(mnemonic##_4H_4H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 4H, \ |
| 4H, \ |
| H, \ |
| kInputFloat16##input_d, \ |
| kInputFloat16##input_n, \ |
| kInputFloat16##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_8H_8H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 8H, \ |
| 8H, \ |
| H, \ |
| kInputFloat16##input_d, \ |
| kInputFloat16##input_n, \ |
| kInputFloat16##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_2S_2S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 2S, \ |
| 2S, \ |
| S, \ |
| kInputFloat##input_d, \ |
| kInputFloat##input_n, \ |
| kInputFloat##input_m, \ |
| kInputSIndices); \ |
| } \ |
| TEST(mnemonic##_4S_4S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 4S, \ |
| 4S, \ |
| S, \ |
| kInputFloat##input_d, \ |
| kInputFloat##input_n, \ |
| kInputFloat##input_m, \ |
| kInputSIndices); \ |
| } \ |
| TEST(mnemonic##_2D_2D_D) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 2D, \ |
| 2D, \ |
| D, \ |
| kInputDouble##input_d, \ |
| kInputDouble##input_n, \ |
| kInputDouble##input_m, \ |
| kInputDIndices); \ |
| } |
| |
| #define DEFINE_TEST_NEON_FHM_BYELEMENT(mnemonic, input_d, input_n, input_m) \ |
| TEST(mnemonic##_2S_2H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 2S, \ |
| 2H, \ |
| H, \ |
| kInputFloatAccDestination, \ |
| kInputFloat16##input_n, \ |
| kInputFloat16##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_4S_4H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 4S, \ |
| 4H, \ |
| H, \ |
| kInputFloatAccDestination, \ |
| kInputFloat16##input_n, \ |
| kInputFloat16##input_m, \ |
| kInputHIndices); \ |
| } |
| |
| #define DEFINE_TEST_NEON_FP_BYELEMENT_SCALAR(mnemonic, inp_d, inp_n, inp_m) \ |
| TEST(mnemonic##_H_H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| H, \ |
| H, \ |
| H, \ |
| kInputFloat16##inp_d, \ |
| kInputFloat16##inp_n, \ |
| kInputFloat16##inp_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_S_S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| S, \ |
| S, \ |
| S, \ |
| kInputFloat##inp_d, \ |
| kInputFloat##inp_n, \ |
| kInputFloat##inp_m, \ |
| kInputSIndices); \ |
| } \ |
| TEST(mnemonic##_D_D_D) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| D, \ |
| D, \ |
| D, \ |
| kInputDouble##inp_d, \ |
| kInputDouble##inp_n, \ |
| kInputDouble##inp_m, \ |
| kInputDIndices); \ |
| } |
| |
| |
| #define DEFINE_TEST_NEON_BYELEMENT_DIFF(mnemonic, input_d, input_n, input_m) \ |
| TEST(mnemonic##_4S_4H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 4S, \ |
| 4H, \ |
| H, \ |
| kInput32bits##input_d, \ |
| kInput16bits##input_n, \ |
| kInput16bits##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##2_4S_8H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic##2, \ |
| 4S, \ |
| 8H, \ |
| H, \ |
| kInput32bits##input_d, \ |
| kInput16bits##input_n, \ |
| kInput16bits##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_2D_2S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| 2D, \ |
| 2S, \ |
| S, \ |
| kInput64bits##input_d, \ |
| kInput32bits##input_n, \ |
| kInput32bits##input_m, \ |
| kInputSIndices); \ |
| } \ |
| TEST(mnemonic##2_2D_4S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic##2, \ |
| 2D, \ |
| 4S, \ |
| S, \ |
| kInput64bits##input_d, \ |
| kInput32bits##input_n, \ |
| kInput32bits##input_m, \ |
| kInputSIndices); \ |
| } |
| |
| #define DEFINE_TEST_NEON_BYELEMENT_DIFF_SCALAR(mnemonic, \ |
| input_d, \ |
| input_n, \ |
| input_m) \ |
| TEST(mnemonic##_S_H_H) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| S, \ |
| H, \ |
| H, \ |
| kInput32bits##input_d, \ |
| kInput16bits##input_n, \ |
| kInput16bits##input_m, \ |
| kInputHIndices); \ |
| } \ |
| TEST(mnemonic##_D_S_S) { \ |
| CALL_TEST_NEON_HELPER_BYELEMENT(mnemonic, \ |
| D, \ |
| S, \ |
| S, \ |
| kInput64bits##input_d, \ |
| kInput32bits##input_n, \ |
| kInput32bits##input_m, \ |
| kInputSIndices); \ |
| } |
| |
| |
| #define CALL_TEST_NEON_HELPER_2OP2IMM(mnemonic, \ |
| variant, \ |
| input_d, \ |
| input_imm1, \ |
| input_n, \ |
| input_imm2) \ |
| { \ |
| CALL_TEST_NEON_HELPER_OpImmOpImm(&MacroAssembler::mnemonic, \ |
| mnemonic, \ |
| variant, \ |
| variant, \ |
| input_d, \ |
| input_imm1, \ |
| input_n, \ |
| input_imm2); \ |
| } |
| |
| #define DEFINE_TEST_NEON_2OP2IMM(mnemonic, \ |
| input_d, \ |
| input_imm1, \ |
| input_n, \ |
| input_imm2) \ |
| TEST(mnemonic##_B) { \ |
| CALL_TEST_NEON_HELPER_2OP2IMM(mnemonic, \ |
| 16B, \ |
| kInput8bits##input_d, \ |
| kInput8bitsImm##input_imm1, \ |
| kInput8bits##input_n, \ |
| kInput8bitsImm##input_imm2); \ |
| } \ |
| TEST(mnemonic##_H) { \ |
| CALL_TEST_NEON_HELPER_2OP2IMM(mnemonic, \ |
| 8H, \ |
| kInput16bits##input_d, \ |
| kInput16bitsImm##input_imm1, \ |
| kInput16bits##input_n, \ |
| kInput16bitsImm##input_imm2); \ |
| } \ |
| TEST(mnemonic##_S) { \ |
| CALL_TEST_NEON_HELPER_2OP2IMM(mnemonic, \ |
| 4S, \ |
| kInput32bits##input_d, \ |
| kInput32bitsImm##input_imm1, \ |
| kInput32bits##input_n, \ |
| kInput32bitsImm##input_imm2); \ |
| } \ |
| TEST(mnemonic##_D) { \ |
| CALL_TEST_NEON_HELPER_2OP2IMM(mnemonic, \ |
| 2D, \ |
| kInput64bits##input_d, \ |
| kInput64bitsImm##input_imm1, \ |
| kInput64bits##input_n, \ |
| kInput64bitsImm##input_imm2); \ |
| } |
| |
| |
| // Advanced SIMD copy. |
| DEFINE_TEST_NEON_2OP2IMM( |
| ins, Basic, LaneCountFromZero, Basic, LaneCountFromZero) |
| DEFINE_TEST_NEON_2OPIMM_COPY(dup, Basic, LaneCountFromZero) |
| |
| |
| // Advanced SIMD scalar copy. |
| DEFINE_TEST_NEON_2OPIMM_SCALAR(dup, Basic, LaneCountFromZero) |
| |
| |
| // Advanced SIMD three same. |
| DEFINE_TEST_NEON_3SAME_NO2D(shadd, Basic) |
| DEFINE_TEST_NEON_3SAME(sqadd, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(srhadd, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(shsub, Basic) |
| DEFINE_TEST_NEON_3SAME(sqsub, Basic) |
| DEFINE_TEST_NEON_3SAME(cmgt, Basic) |
| DEFINE_TEST_NEON_3SAME(cmge, Basic) |
| DEFINE_TEST_NEON_3SAME(sshl, Basic) |
| DEFINE_TEST_NEON_3SAME(sqshl, Basic) |
| DEFINE_TEST_NEON_3SAME(srshl, Basic) |
| DEFINE_TEST_NEON_3SAME(sqrshl, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(smax, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(smin, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(sabd, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(saba, Basic) |
| DEFINE_TEST_NEON_3SAME(add, Basic) |
| DEFINE_TEST_NEON_3SAME(cmtst, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(mla, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(mul, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(smaxp, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(sminp, Basic) |
| DEFINE_TEST_NEON_3SAME_HS(sqdmulh, Basic) |
| DEFINE_TEST_NEON_3SAME(addp, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmaxnm, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmla, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fadd, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmulx, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fcmeq, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmax, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(frecps, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(and_, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(bic, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fminnm, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmls, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fsub, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmin, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(frsqrts, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(orr, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(orn, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(uhadd, Basic) |
| DEFINE_TEST_NEON_3SAME(uqadd, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(urhadd, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(uhsub, Basic) |
| DEFINE_TEST_NEON_3SAME(uqsub, Basic) |
| DEFINE_TEST_NEON_3SAME(cmhi, Basic) |
| DEFINE_TEST_NEON_3SAME(cmhs, Basic) |
| DEFINE_TEST_NEON_3SAME(ushl, Basic) |
| DEFINE_TEST_NEON_3SAME(uqshl, Basic) |
| DEFINE_TEST_NEON_3SAME(urshl, Basic) |
| DEFINE_TEST_NEON_3SAME(uqrshl, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(umax, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(umin, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(uabd, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(uaba, Basic) |
| DEFINE_TEST_NEON_3SAME(sub, Basic) |
| DEFINE_TEST_NEON_3SAME(cmeq, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(mls, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(pmul, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(uminp, Basic) |
| DEFINE_TEST_NEON_3SAME_NO2D(umaxp, Basic) |
| DEFINE_TEST_NEON_3SAME_HS(sqrdmulh, Basic) |
| DEFINE_TEST_NEON_3SAME_HS(sqrdmlah, Basic) |
| DEFINE_TEST_NEON_3SAME_HS(sqrdmlsh, Basic) |
| DEFINE_TEST_NEON_3DIFF_DOUBLE_WIDE(udot, Basic) |
| DEFINE_TEST_NEON_3DIFF_DOUBLE_WIDE(sdot, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmaxnmp, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(faddp, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmul, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fcmge, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(facge, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fmaxp, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fdiv, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(eor, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(bsl, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fminnmp, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fabd, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fcmgt, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(facgt, Basic) |
| DEFINE_TEST_NEON_3SAME_FP(fminp, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(bit, Basic) |
| DEFINE_TEST_NEON_3SAME_8B_16B(bif, Basic) |
| |
| |
| // Advanced SIMD scalar three same. |
| DEFINE_TEST_NEON_3SAME_SCALAR(sqadd, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR(sqsub, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(cmgt, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(cmge, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(sshl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR(sqshl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(srshl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR(sqrshl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(add, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(cmtst, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_HS(sqdmulh, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(fmulx, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(fcmeq, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(frecps, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(frsqrts, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(uqadd, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(uqsub, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(cmhi, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(cmhs, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(ushl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR(uqshl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(urshl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR(uqrshl, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(sub, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_D(cmeq, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_HS(sqrdmulh, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_HS(sqrdmlah, Basic) |
| DEFINE_TEST_NEON_3SAME_SCALAR_HS(sqrdmlsh, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(fcmge, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(facge, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(fabd, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(fcmgt, Basic) |
| DEFINE_TEST_NEON_3SAME_FP_SCALAR(facgt, Basic) |
| |
| |
// Advanced SIMD FHM instructions (FMLAL, FMLSL).
// These are oddballs: they are encoded under the 3SAME group, but they widen,
// multiplying half-precision elements and accumulating the products into a
// single-precision destination.
| DEFINE_TEST_NEON_FHM(fmlal, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FHM(fmlal2, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FHM(fmlsl, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FHM(fmlsl2, Basic, Basic, Basic) |
| |
| |
| // Advanced SIMD three different. |
| DEFINE_TEST_NEON_3DIFF_LONG(saddl, Basic) |
| DEFINE_TEST_NEON_3DIFF_WIDE(saddw, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(ssubl, Basic) |
| DEFINE_TEST_NEON_3DIFF_WIDE(ssubw, Basic) |
| DEFINE_TEST_NEON_3DIFF_NARROW(addhn, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(sabal, Basic) |
| DEFINE_TEST_NEON_3DIFF_NARROW(subhn, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(sabdl, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(smlal, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG_SD(sqdmlal, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(smlsl, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG_SD(sqdmlsl, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(smull, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG_SD(sqdmull, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG_8H(pmull, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(uaddl, Basic) |
| DEFINE_TEST_NEON_3DIFF_WIDE(uaddw, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(usubl, Basic) |
| DEFINE_TEST_NEON_3DIFF_WIDE(usubw, Basic) |
| DEFINE_TEST_NEON_3DIFF_NARROW(raddhn, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(uabal, Basic) |
| DEFINE_TEST_NEON_3DIFF_NARROW(rsubhn, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(uabdl, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(umlal, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(umlsl, Basic) |
| DEFINE_TEST_NEON_3DIFF_LONG(umull, Basic) |
| |
| |
| // Advanced SIMD scalar three different. |
| DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_SD(sqdmlal, Basic) |
| DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_SD(sqdmlsl, Basic) |
| DEFINE_TEST_NEON_3DIFF_SCALAR_LONG_SD(sqdmull, Basic) |
| |
| |
| // Advanced SIMD scalar pairwise. |
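// ADDP is the only integer scalar pairwise instruction, and it exists only in
// a D <- 2D form, so its test is written out directly rather than generated
// from a macro.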
| TEST(addp_SCALAR) { |
| CALL_TEST_NEON_HELPER_2DIFF(addp, D, 2D, kInput64bitsBasic); |
| } |
| DEFINE_TEST_NEON_2DIFF_FP_SCALAR_SD(fmaxnmp, Basic) |
| DEFINE_TEST_NEON_2DIFF_FP_SCALAR_SD(faddp, Basic) |
| DEFINE_TEST_NEON_2DIFF_FP_SCALAR_SD(fmaxp, Basic) |
| DEFINE_TEST_NEON_2DIFF_FP_SCALAR_SD(fminnmp, Basic) |
| DEFINE_TEST_NEON_2DIFF_FP_SCALAR_SD(fminp, Basic) |
| |
| |
| // Advanced SIMD shift by immediate. |
| DEFINE_TEST_NEON_2OPIMM(sshr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(ssra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(srshr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(srsra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(shl, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM(sqshl, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(shrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(rshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(sqshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(sqrshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_LONG(sshll, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_HSD(scvtf, |
| FixedPointConversions, |
| TypeWidthFromZeroToWidth) |
| DEFINE_TEST_NEON_2OPIMM_FP(fcvtzs, Conversions, TypeWidthFromZeroToWidth) |
| DEFINE_TEST_NEON_2OPIMM(ushr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(usra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(urshr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(ursra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(sri, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM(sli, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM(sqshlu, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM(uqshl, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(sqshrun, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(sqrshrun, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(uqshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_NARROW(uqrshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_LONG(ushll, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_HSD(ucvtf, |
| FixedPointConversions, |
| TypeWidthFromZeroToWidth) |
| DEFINE_TEST_NEON_2OPIMM_FP(fcvtzu, Conversions, TypeWidthFromZeroToWidth) |
| |
| |
// Advanced SIMD scalar shift by immediate.
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(sshr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(ssra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(srshr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(srsra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(shl, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR(sqshl, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_NARROW(sqshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_NARROW(sqrshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_HSD(scvtf, |
| FixedPointConversions, |
| TypeWidthFromZeroToWidth) |
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR(fcvtzs, Conversions, TypeWidthFromZeroToWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(ushr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(usra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(urshr, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(ursra, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(sri, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(sli, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR(sqshlu, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR(uqshl, Basic, TypeWidthFromZero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_NARROW(sqshrun, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_NARROW(sqrshrun, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_NARROW(uqshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_NARROW(uqrshrn, Basic, TypeWidth) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_HSD(ucvtf, |
| FixedPointConversions, |
| TypeWidthFromZeroToWidth) |
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR(fcvtzu, Conversions, TypeWidthFromZeroToWidth) |
| |
| |
| // Advanced SIMD two-register miscellaneous. |
| DEFINE_TEST_NEON_2SAME_NO2D(rev64, Basic) |
| DEFINE_TEST_NEON_2SAME_8B_16B(rev16, Basic) |
| DEFINE_TEST_NEON_2DIFF_LONG(saddlp, Basic) |
| DEFINE_TEST_NEON_2SAME(suqadd, Basic) |
| DEFINE_TEST_NEON_2SAME_NO2D(cls, Basic) |
| DEFINE_TEST_NEON_2SAME_8B_16B(cnt, Basic) |
| DEFINE_TEST_NEON_2DIFF_LONG(sadalp, Basic) |
| DEFINE_TEST_NEON_2SAME(sqabs, Basic) |
| DEFINE_TEST_NEON_2OPIMM(cmgt, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM(cmeq, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM(cmlt, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME(abs, Basic) |
| DEFINE_TEST_NEON_2DIFF_NARROW(xtn, Basic) |
| DEFINE_TEST_NEON_2DIFF_NARROW(sqxtn, Basic) |
| DEFINE_TEST_NEON_2DIFF_FP_NARROW(fcvtn, Conversions) |
| DEFINE_TEST_NEON_2DIFF_FP_LONG(fcvtl, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frintn, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frintm, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtns, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtms, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtas, Conversions) |
// SCVTF (vector, integer) covered by SCVTF (vector, fixed point) with fbits 0.
| DEFINE_TEST_NEON_2OPIMM_FCMP_ZERO(fcmgt, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_FCMP_ZERO(fcmeq, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_FCMP_ZERO(fcmlt, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fabs, Basic) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frintp, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frintz, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtps, Conversions) |
// FCVTZS (vector, integer) covered by FCVTZS (vector, fixed point) with
// fbits 0.
| DEFINE_TEST_NEON_2SAME_2S_4S(urecpe, Basic) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frecpe, Basic) |
| DEFINE_TEST_NEON_2SAME_BH(rev32, Basic) |
| DEFINE_TEST_NEON_2DIFF_LONG(uaddlp, Basic) |
| DEFINE_TEST_NEON_2SAME(usqadd, Basic) |
| DEFINE_TEST_NEON_2SAME_NO2D(clz, Basic) |
| DEFINE_TEST_NEON_2DIFF_LONG(uadalp, Basic) |
| DEFINE_TEST_NEON_2SAME(sqneg, Basic) |
| DEFINE_TEST_NEON_2OPIMM(cmge, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM(cmle, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME(neg, Basic) |
| DEFINE_TEST_NEON_2DIFF_NARROW(sqxtun, Basic) |
| DEFINE_TEST_NEON_2OPIMM_LONG(shll, Basic, SHLL) |
| DEFINE_TEST_NEON_2DIFF_NARROW(uqxtn, Basic) |
| DEFINE_TEST_NEON_2DIFF_FP_NARROW_2S(fcvtxn, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP(frint32x, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP(frint64x, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP(frint32z, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP(frint64z, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frinta, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frintx, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtnu, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtmu, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtau, Conversions) |
// UCVTF (vector, integer) covered by UCVTF (vector, fixed point) with fbits 0.
| DEFINE_TEST_NEON_2SAME_8B_16B(not_, Basic) |
| DEFINE_TEST_NEON_2SAME_8B_16B(rbit, Basic) |
| DEFINE_TEST_NEON_2OPIMM_FCMP_ZERO(fcmge, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_FCMP_ZERO(fcmle, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fneg, Basic) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frinti, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fcvtpu, Conversions) |
// FCVTZU (vector, integer) covered by FCVTZU (vector, fixed point) with
// fbits 0.
| DEFINE_TEST_NEON_2SAME_2S_4S(ursqrte, Basic) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(frsqrte, Basic) |
| DEFINE_TEST_NEON_2SAME_FP_FP16(fsqrt, Basic) |
| |
| |
| // Advanced SIMD scalar two-register miscellaneous. |
| DEFINE_TEST_NEON_2SAME_SCALAR(suqadd, Basic) |
| DEFINE_TEST_NEON_2SAME_SCALAR(sqabs, Basic) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(cmgt, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(cmeq, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(cmlt, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME_SCALAR_D(abs, Basic) |
| DEFINE_TEST_NEON_2DIFF_SCALAR_NARROW(sqxtn, Basic) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtns, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtms, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtas, Conversions) |
// SCVTF (vector, integer) covered by SCVTF (vector, fixed point) with fbits 0.
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_HSD(fcmgt, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_HSD(fcmeq, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_HSD(fcmlt, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtps, Conversions) |
// FCVTZS (vector, integer) covered by FCVTZS (vector, fixed point) with
// fbits 0.
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(frecpe, Basic) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(frecpx, Basic) |
| DEFINE_TEST_NEON_2SAME_SCALAR(usqadd, Basic) |
| DEFINE_TEST_NEON_2SAME_SCALAR(sqneg, Basic) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(cmge, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_SCALAR_D(cmle, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME_SCALAR_D(neg, Basic) |
| DEFINE_TEST_NEON_2DIFF_SCALAR_NARROW(sqxtun, Basic) |
| DEFINE_TEST_NEON_2DIFF_SCALAR_NARROW(uqxtn, Basic) |
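// FCVTXN's scalar form only converts D -> S (round-to-odd), so it gets a
// one-off test rather than a macro-generated set.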
| TEST(fcvtxn_SCALAR) { |
| CALL_TEST_NEON_HELPER_2DIFF(fcvtxn, S, D, kInputDoubleConversions); |
| } |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtnu, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtmu, Conversions) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtau, Conversions) |
// UCVTF (vector, integer) covered by UCVTF (vector, fixed point) with fbits 0.
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_HSD(fcmge, Basic, Zero) |
| DEFINE_TEST_NEON_2OPIMM_FP_SCALAR_HSD(fcmle, Basic, Zero) |
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(fcvtpu, Conversions) |
// FCVTZU (vector, integer) covered by FCVTZU (vector, fixed point) with
// fbits 0.
| DEFINE_TEST_NEON_2SAME_FP_FP16_SCALAR(frsqrte, Basic) |
| |
| |
| // Advanced SIMD across lanes. |
| DEFINE_TEST_NEON_ACROSS_LONG(saddlv, Basic) |
| DEFINE_TEST_NEON_ACROSS(smaxv, Basic) |
| DEFINE_TEST_NEON_ACROSS(sminv, Basic) |
| DEFINE_TEST_NEON_ACROSS(addv, Basic) |
| DEFINE_TEST_NEON_ACROSS_LONG(uaddlv, Basic) |
| DEFINE_TEST_NEON_ACROSS(umaxv, Basic) |
| DEFINE_TEST_NEON_ACROSS(uminv, Basic) |
| DEFINE_TEST_NEON_ACROSS_FP(fmaxnmv, Basic) |
| DEFINE_TEST_NEON_ACROSS_FP(fmaxv, Basic) |
| DEFINE_TEST_NEON_ACROSS_FP(fminnmv, Basic) |
| DEFINE_TEST_NEON_ACROSS_FP(fminv, Basic) |
| |
| |
| // Advanced SIMD permute. |
| DEFINE_TEST_NEON_3SAME(uzp1, Basic) |
| DEFINE_TEST_NEON_3SAME(trn1, Basic) |
| DEFINE_TEST_NEON_3SAME(zip1, Basic) |
| DEFINE_TEST_NEON_3SAME(uzp2, Basic) |
| DEFINE_TEST_NEON_3SAME(trn2, Basic) |
| DEFINE_TEST_NEON_3SAME(zip2, Basic) |
| |
| |
| // Advanced SIMD vector x indexed element. |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(smlal, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(sqdmlal, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(smlsl, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(sqdmlsl, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT(mul, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(smull, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(sqdmull, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT(sqdmulh, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT(sqrdmulh, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT(sqrdmlah, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT(sqrdmlsh, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DOT_PRODUCT(udot, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DOT_PRODUCT(sdot, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT(fmla, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT(fmls, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT(fmul, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT(mla, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(umlal, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT(mls, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(umlsl, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF(umull, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT(fmulx, Basic, Basic, Basic) |
| |
| |
| // Advanced SIMD scalar x indexed element. |
| DEFINE_TEST_NEON_BYELEMENT_DIFF_SCALAR(sqdmlal, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF_SCALAR(sqdmlsl, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_DIFF_SCALAR(sqdmull, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_SCALAR(sqdmulh, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_SCALAR(sqrdmulh, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_SCALAR(sqrdmlah, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_BYELEMENT_SCALAR(sqrdmlsh, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT_SCALAR(fmla, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT_SCALAR(fmls, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT_SCALAR(fmul, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FP_BYELEMENT_SCALAR(fmulx, Basic, Basic, Basic) |
| |
| |
| DEFINE_TEST_NEON_FHM_BYELEMENT(fmlal, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FHM_BYELEMENT(fmlal2, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FHM_BYELEMENT(fmlsl, Basic, Basic, Basic) |
| DEFINE_TEST_NEON_FHM_BYELEMENT(fmlsl2, Basic, Basic, Basic) |
| |
| |
| #ifdef VIXL_ENABLE_IMPLICIT_CHECKS |
| TEST(ImplicitCheck) { |
| SETUP_WITH_FEATURES(CPUFeatures::kNEON); |
| START_IMPLICIT_CHECK(); |
| |
| EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); |
| // Invalid memory reads. |
| __ ldar(w3, bad_memory); |
| __ ldar(x4, bad_memory); |
| __ ldarb(w5, bad_memory); |
| __ ldarb(x6, bad_memory); |
| __ ldarh(w7, bad_memory); |
| __ ldarh(x8, bad_memory); |
| __ ldaxp(w9, w10, bad_memory); |
| __ ldaxp(x11, x12, bad_memory); |
| __ ldaxr(w13, bad_memory); |
| __ ldaxr(x14, bad_memory); |
| __ ldaxrb(w15, bad_memory); |
| __ ldaxrb(x16, bad_memory); |
| __ ldaxrh(w17, bad_memory); |
| __ ldaxrh(x18, bad_memory); |
| __ ldnp(w19, w20, bad_memory); |
| __ ldnp(x21, x22, bad_memory); |
| __ ldp(w23, w24, bad_memory); |
| __ ldp(x25, x26, bad_memory); |
| __ ldpsw(x27, x28, bad_memory); |
| __ ldr(w29, bad_memory); |
| __ ldr(x2, bad_memory); |
| __ ldrb(w3, bad_memory); |
| __ ldrb(x4, bad_memory); |
| __ ldrh(w5, bad_memory); |
| __ ldrh(x6, bad_memory); |
| __ ldrsb(w7, bad_memory); |
| __ ldrsb(x8, bad_memory); |
| __ ldrsh(w9, bad_memory); |
| __ ldrsh(x10, bad_memory); |
| __ ldrsw(x11, bad_memory); |
| __ ldur(w12, bad_memory); |
| __ ldur(x13, bad_memory); |
| __ ldurb(w14, bad_memory); |
| __ ldurb(x15, bad_memory); |
| __ ldurh(w16, bad_memory); |
| __ ldurh(x17, bad_memory); |
| __ ldursb(w18, bad_memory); |
| __ ldursb(x19, bad_memory); |
| __ ldursh(w20, bad_memory); |
| __ ldursh(x21, bad_memory); |
| __ ldursw(x22, bad_memory); |
| __ ldxp(w23, w24, bad_memory); |
| __ ldxp(x25, x26, bad_memory); |
| __ ldxr(w27, bad_memory); |
| __ ldxr(x28, bad_memory); |
| __ ldxrb(w29, bad_memory); |
| __ ldxrb(x2, bad_memory); |
| __ ldxrh(w3, bad_memory); |
| __ ldxrh(x4, bad_memory); |
| |
| // Invalid memory writes. Note: exclusive store instructions are not tested |
| // because they can fail due to the global monitor before trying to perform a |
| // memory store. |
| __ stlr(w18, bad_memory); |
| __ stlr(x19, bad_memory); |
| __ stlrb(w20, bad_memory); |
| __ stlrb(x21, bad_memory); |
| __ stlrh(w22, bad_memory); |
| __ stlrh(x23, bad_memory); |
| __ stnp(w14, w15, bad_memory); |
| __ stnp(x16, x17, bad_memory); |
| __ stp(w18, w19, bad_memory); |
| __ stp(x20, x21, bad_memory); |
| __ str(w22, bad_memory); |
| __ str(x23, bad_memory); |
| __ strb(w24, bad_memory); |
| __ strb(x25, bad_memory); |
| __ strh(w26, bad_memory); |
| __ strh(x27, bad_memory); |
| __ stur(w28, bad_memory); |
| __ stur(x29, bad_memory); |
| __ sturb(w2, bad_memory); |
| __ sturb(x3, bad_memory); |
| __ sturh(w4, bad_memory); |
| __ sturh(x5, bad_memory); |
| |
| END_IMPLICIT_CHECK(); |
| TRY_RUN_IMPLICIT_CHECK(); |
| } |
| |
| TEST(ImplicitCheckNeon) { |
| SETUP_WITH_FEATURES(CPUFeatures::kNEON); |
| START_IMPLICIT_CHECK(); |
| |
| EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); |
| __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), bad_memory); |
| __ ld1(v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); |
| __ ld1(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), bad_memory); |
| __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), bad_memory); |
| __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), bad_memory); |
| __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), bad_memory); |
| __ ld1(v17.V16B(), v18.V16B(), bad_memory); |
| __ ld1(v20.V16B(), v21.V16B(), bad_memory); |
| __ ld1(v28.V16B(), v29.V16B(), bad_memory); |
| __ ld1(v29.V16B(), bad_memory); |
| __ ld1(v21.V16B(), bad_memory); |
| __ ld1(v4.V16B(), bad_memory); |
| __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), bad_memory); |
| __ ld1(v17.V1D(), v18.V1D(), v19.V1D(), v20.V1D(), bad_memory); |
| __ ld1(v28.V1D(), v29.V1D(), v30.V1D(), v31.V1D(), bad_memory); |
| __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), bad_memory); |
| __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), bad_memory); |
| __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), bad_memory); |
| __ ld1(v29.V1D(), v30.V1D(), bad_memory); |
| __ ld1(v31.V1D(), v0.V1D(), bad_memory); |
| __ ld1(v3.V1D(), v4.V1D(), bad_memory); |
| __ ld1(v28.V1D(), bad_memory); |
| __ ld1(v11.V1D(), bad_memory); |
| __ ld1(v29.V1D(), bad_memory); |
| __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory); |
| __ ld1(v8.V2D(), v9.V2D(), v10.V2D(), v11.V2D(), bad_memory); |
| __ ld1(v14.V2D(), v15.V2D(), v16.V2D(), v17.V2D(), bad_memory); |
| __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), bad_memory); |
| __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), bad_memory); |
| __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), bad_memory); |
| __ ld1(v18.V2D(), v19.V2D(), bad_memory); |
| __ ld1(v21.V2D(), v22.V2D(), bad_memory); |
| __ ld1(v17.V2D(), v18.V2D(), bad_memory); |
| __ ld1(v5.V2D(), bad_memory); |
| __ ld1(v6.V2D(), bad_memory); |
| __ ld1(v15.V2D(), bad_memory); |
| __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), bad_memory); |
| __ ld1(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory); |
| __ ld1(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), bad_memory); |
| __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), bad_memory); |
| __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), bad_memory); |
| __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), bad_memory); |
| __ ld1(v0.V2S(), v1.V2S(), bad_memory); |
| __ ld1(v13.V2S(), v14.V2S(), bad_memory); |
| __ ld1(v3.V2S(), v4.V2S(), bad_memory); |
| __ ld1(v26.V2S(), bad_memory); |
| __ ld1(v0.V2S(), bad_memory); |
| __ ld1(v11.V2S(), bad_memory); |
| __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), bad_memory); |
| __ ld1(v24.V4H(), v25.V4H(), v26.V4H(), v27.V4H(), bad_memory); |
| __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), bad_memory); |
| __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), bad_memory); |
| __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), bad_memory); |
| __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), bad_memory); |
| __ ld1(v3.V4H(), v4.V4H(), bad_memory); |
| __ ld1(v3.V4H(), v4.V4H(), bad_memory); |
| __ ld1(v23.V4H(), v24.V4H(), bad_memory); |
| __ ld1(v26.V4H(), bad_memory); |
| __ ld1(v1.V4H(), bad_memory); |
| __ ld1(v14.V4H(), bad_memory); |
| __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), bad_memory); |
| __ ld1(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), bad_memory); |
| __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), bad_memory); |
| __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), bad_memory); |
| __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), bad_memory); |
| __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), bad_memory); |
| __ ld1(v20.V4S(), v21.V4S(), bad_memory); |
| __ ld1(v30.V4S(), v31.V4S(), bad_memory); |
| __ ld1(v11.V4S(), v12.V4S(), bad_memory); |
| __ ld1(v15.V4S(), bad_memory); |
| __ ld1(v12.V4S(), bad_memory); |
| __ ld1(v0.V4S(), bad_memory); |
| __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), bad_memory); |
| __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), bad_memory); |
| __ ld1(v9.V8B(), v10.V8B(), v11.V8B(), v12.V8B(), bad_memory); |
| __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), bad_memory); |
| __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), bad_memory); |
| __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), bad_memory); |
| __ ld1(v10.V8B(), v11.V8B(), bad_memory); |
| __ ld1(v11.V8B(), v12.V8B(), bad_memory); |
| __ ld1(v27.V8B(), v28.V8B(), bad_memory); |
| __ ld1(v31.V8B(), bad_memory); |
| __ ld1(v10.V8B(), bad_memory); |
| __ ld1(v28.V8B(), bad_memory); |
| __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), bad_memory); |
| __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), bad_memory); |
| __ ld1(v10.V8H(), v11.V8H(), v12.V8H(), v13.V8H(), bad_memory); |
| __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), bad_memory); |
| __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), bad_memory); |
| __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), bad_memory); |
| __ ld1(v4.V8H(), v5.V8H(), bad_memory); |
| __ ld1(v21.V8H(), v22.V8H(), bad_memory); |
| __ ld1(v4.V8H(), v5.V8H(), bad_memory); |
| __ ld1(v9.V8H(), bad_memory); |
| __ ld1(v27.V8H(), bad_memory); |
| __ ld1(v26.V8H(), bad_memory); |
| __ ld1(v19.B(), 1, bad_memory); |
| __ ld1(v12.B(), 3, bad_memory); |
| __ ld1(v27.B(), 12, bad_memory); |
| __ ld1(v10.D(), 1, bad_memory); |
| __ ld1(v26.D(), 1, bad_memory); |
| __ ld1(v7.D(), 1, bad_memory); |
| __ ld1(v19.H(), 5, bad_memory); |
| __ ld1(v10.H(), 1, bad_memory); |
| __ ld1(v5.H(), 4, bad_memory); |
| __ ld1(v21.S(), 2, bad_memory); |
| __ ld1(v13.S(), 2, bad_memory); |
| __ ld1(v1.S(), 2, bad_memory); |
| __ ld1r(v2.V16B(), bad_memory); |
| __ ld1r(v2.V16B(), bad_memory); |
| __ ld1r(v22.V16B(), bad_memory); |
| __ ld1r(v25.V1D(), bad_memory); |
| __ ld1r(v9.V1D(), bad_memory); |
| __ ld1r(v23.V1D(), bad_memory); |
| __ ld1r(v19.V2D(), bad_memory); |
| __ ld1r(v21.V2D(), bad_memory); |
| __ ld1r(v30.V2D(), bad_memory); |
| __ ld1r(v24.V2S(), bad_memory); |
| __ ld1r(v26.V2S(), bad_memory); |
| __ ld1r(v28.V2S(), bad_memory); |
| __ ld1r(v19.V4H(), bad_memory); |
| __ ld1r(v1.V4H(), bad_memory); |
| __ ld1r(v21.V4H(), bad_memory); |
| __ ld1r(v15.V4S(), bad_memory); |
| __ ld1r(v21.V4S(), bad_memory); |
| __ ld1r(v23.V4S(), bad_memory); |
| __ ld1r(v26.V8B(), bad_memory); |
| __ ld1r(v14.V8B(), bad_memory); |
| __ ld1r(v19.V8B(), bad_memory); |
| __ ld1r(v13.V8H(), bad_memory); |
| __ ld1r(v30.V8H(), bad_memory); |
| __ ld1r(v27.V8H(), bad_memory); |
| __ ld2(v21.V16B(), v22.V16B(), bad_memory); |
| __ ld2(v21.V16B(), v22.V16B(), bad_memory); |
| __ ld2(v12.V16B(), v13.V16B(), bad_memory); |
| __ ld2(v14.V2D(), v15.V2D(), bad_memory); |
| __ ld2(v0.V2D(), v1.V2D(), bad_memory); |
| __ ld2(v12.V2D(), v13.V2D(), bad_memory); |
| __ ld2(v27.V2S(), v28.V2S(), bad_memory); |
| __ ld2(v2.V2S(), v3.V2S(), bad_memory); |
| __ ld2(v12.V2S(), v13.V2S(), bad_memory); |
| __ ld2(v9.V4H(), v10.V4H(), bad_memory); |
| __ ld2(v23.V4H(), v24.V4H(), bad_memory); |
| __ ld2(v1.V4H(), v2.V4H(), bad_memory); |
| __ ld2(v20.V4S(), v21.V4S(), bad_memory); |
| __ ld2(v10.V4S(), v11.V4S(), bad_memory); |
| __ ld2(v24.V4S(), v25.V4S(), bad_memory); |
| __ ld2(v17.V8B(), v18.V8B(), bad_memory); |
| __ ld2(v13.V8B(), v14.V8B(), bad_memory); |
| __ ld2(v7.V8B(), v8.V8B(), bad_memory); |
| __ ld2(v30.V8H(), v31.V8H(), bad_memory); |
| __ ld2(v4.V8H(), v5.V8H(), bad_memory); |
| __ ld2(v13.V8H(), v14.V8H(), bad_memory); |
| __ ld2(v5.B(), v6.B(), 12, bad_memory); |
| __ ld2(v16.B(), v17.B(), 7, bad_memory); |
| __ ld2(v29.B(), v30.B(), 2, bad_memory); |
| __ ld2(v11.D(), v12.D(), 1, bad_memory); |
| __ ld2(v26.D(), v27.D(), 0, bad_memory); |
| __ ld2(v25.D(), v26.D(), 0, bad_memory); |
| __ ld2(v18.H(), v19.H(), 7, bad_memory); |
| __ ld2(v17.H(), v18.H(), 5, bad_memory); |
| __ ld2(v30.H(), v31.H(), 2, bad_memory); |
| __ ld2(v29.S(), v30.S(), 3, bad_memory); |
| __ ld2(v28.S(), v29.S(), 0, bad_memory); |
| __ ld2(v6.S(), v7.S(), 1, bad_memory); |
| __ ld2r(v26.V16B(), v27.V16B(), bad_memory); |
| __ ld2r(v21.V16B(), v22.V16B(), bad_memory); |
| __ ld2r(v5.V16B(), v6.V16B(), bad_memory); |
| __ ld2r(v26.V1D(), v27.V1D(), bad_memory); |
| __ ld2r(v14.V1D(), v15.V1D(), bad_memory); |
| __ ld2r(v23.V1D(), v24.V1D(), bad_memory); |
| __ ld2r(v11.V2D(), v12.V2D(), bad_memory); |
| __ ld2r(v29.V2D(), v30.V2D(), bad_memory); |
| __ ld2r(v15.V2D(), v16.V2D(), bad_memory); |
| __ ld2r(v26.V2S(), v27.V2S(), bad_memory); |
| __ ld2r(v22.V2S(), v23.V2S(), bad_memory); |
| __ ld2r(v2.V2S(), v3.V2S(), bad_memory); |
| __ ld2r(v2.V4H(), v3.V4H(), bad_memory); |
| __ ld2r(v9.V4H(), v10.V4H(), bad_memory); |
| __ ld2r(v6.V4H(), v7.V4H(), bad_memory); |
| __ ld2r(v7.V4S(), v8.V4S(), bad_memory); |
| __ ld2r(v19.V4S(), v20.V4S(), bad_memory); |
| __ ld2r(v21.V4S(), v22.V4S(), bad_memory); |
| __ ld2r(v26.V8B(), v27.V8B(), bad_memory); |
| __ ld2r(v20.V8B(), v21.V8B(), bad_memory); |
| __ ld2r(v11.V8B(), v12.V8B(), bad_memory); |
| __ ld2r(v12.V8H(), v13.V8H(), bad_memory); |
| __ ld2r(v6.V8H(), v7.V8H(), bad_memory); |
| __ ld2r(v25.V8H(), v26.V8H(), bad_memory); |
| __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), bad_memory); |
| __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), bad_memory); |
| __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), bad_memory); |
| __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), bad_memory); |
| __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), bad_memory); |
| __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), bad_memory); |
| __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), bad_memory); |
| __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), bad_memory); |
| __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), bad_memory); |
| __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), bad_memory); |
| __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), bad_memory); |
| __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), bad_memory); |
| __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), bad_memory); |
| __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), bad_memory); |
| __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), bad_memory); |
| __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), bad_memory); |
| __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), bad_memory); |
| __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), bad_memory); |
| __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), bad_memory); |
| __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), bad_memory); |
| __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), bad_memory); |
| __ ld3(v21.B(), v22.B(), v23.B(), 11, bad_memory); |
| __ ld3(v5.B(), v6.B(), v7.B(), 9, bad_memory); |
| __ ld3(v23.B(), v24.B(), v25.B(), 0, bad_memory); |
| __ ld3(v16.D(), v17.D(), v18.D(), 0, bad_memory); |
| __ ld3(v30.D(), v31.D(), v0.D(), 0, bad_memory); |
| __ ld3(v28.D(), v29.D(), v30.D(), 1, bad_memory); |
| __ ld3(v13.H(), v14.H(), v15.H(), 2, bad_memory); |
| __ ld3(v22.H(), v23.H(), v24.H(), 7, bad_memory); |
| __ ld3(v14.H(), v15.H(), v16.H(), 3, bad_memory); |
| __ ld3(v22.S(), v23.S(), v24.S(), 3, bad_memory); |
| __ ld3(v30.S(), v31.S(), v0.S(), 2, bad_memory); |
| __ ld3(v12.S(), v13.S(), v14.S(), 1, bad_memory); |
| __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); |
| __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); |
| __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), bad_memory); |
| __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), bad_memory); |
| __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), bad_memory); |
| __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), bad_memory); |
| __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), bad_memory); |
| __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), bad_memory); |
| __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), bad_memory); |
| __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), bad_memory); |
| __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), bad_memory); |
| __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), bad_memory); |
| __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), bad_memory); |
| __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), bad_memory); |
| __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), bad_memory); |
| __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), bad_memory); |
| __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), bad_memory); |
| __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), bad_memory); |
| __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), bad_memory); |
| __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), bad_memory); |
| __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), bad_memory); |
| __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), bad_memory); |
| __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), bad_memory); |
| __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), bad_memory); |
| __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), bad_memory); |
| __ ld4(v2.V16B(), v3.V16B(), v4.V16B(), v5.V16B(), bad_memory); |
| __ ld4(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), bad_memory); |
| __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), bad_memory); |
| __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), bad_memory); |
| __ ld4(v29.V2D(), v30.V2D(), v31.V2D(), v0.V2D(), bad_memory); |
| __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), bad_memory); |
| __ ld4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory); |
| __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), bad_memory); |
| __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), bad_memory); |
| __ ld4(v23.V4H(), v24.V4H(), v25.V4H(), v26.V4H(), bad_memory); |
| __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), bad_memory); |
| __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), bad_memory); |
| __ ld4(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), bad_memory); |
| __ ld4(v29.V4S(), v30.V4S(), v31.V4S(), v0.V4S(), bad_memory); |
| __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), bad_memory); |
| __ ld4(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), bad_memory); |
| __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), bad_memory); |
| __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), bad_memory); |
| __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), bad_memory); |
| __ ld4(v20.V8H(), v21.V8H(), v22.V8H(), v23.V8H(), bad_memory); |
| __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, bad_memory); |
| __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, bad_memory); |
| __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, bad_memory); |
| __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, bad_memory); |
| __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, bad_memory); |
| __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, bad_memory); |
| __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, bad_memory); |
| __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, bad_memory); |
| __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, bad_memory); |
| __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, bad_memory); |
| __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, bad_memory); |
| __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, bad_memory); |
| __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), bad_memory); |
| __ ld4r(v13.V16B(), v14.V16B(), v15.V16B(), v16.V16B(), bad_memory); |
| __ ld4r(v9.V16B(), v10.V16B(), v11.V16B(), v12.V16B(), bad_memory); |
| __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), bad_memory); |
| __ ld4r(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), bad_memory); |
| __ ld4r(v26.V1D(), v27.V1D(), v28.V1D(), v29.V1D(), bad_memory); |
| __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), bad_memory); |
| __ ld4r(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory); |
| __ ld4r(v15.V2D(), v16.V2D(), v17.V2D(), v18.V2D(), bad_memory); |
| __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), bad_memory); |
| __ ld4r(v28.V2S(), v29.V2S(), v30.V2S(), v31.V2S(), bad_memory); |
| __ ld4r(v11.V2S(), v12.V2S(), v13.V2S(), v14.V2S(), bad_memory); |
| __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), bad_memory); |
| __ ld4r(v22.V4H(), v23.V4H(), v24.V4H(), v25.V4H(), bad_memory); |
| __ ld4r(v20.V4H(), v21.V4H(), v22.V4H(), v23.V4H(), bad_memory); |
| __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), bad_memory); |
| __ ld4r(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), bad_memory); |
| __ ld4r(v23.V4S(), v24.V4S(), v25.V4S(), v26.V4S(), bad_memory); |
| __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), bad_memory); |
| __ ld4r(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), bad_memory); |
| __ ld4r(v29.V8B(), v30.V8B(), v31.V8B(), v0.V8B(), bad_memory); |
| __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), bad_memory); |
| __ ld4r(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), bad_memory); |
| __ ld4r(v22.V8H(), v23.V8H(), v24.V8H(), v25.V8H(), bad_memory); |
| |
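  // Store counterparts of the loads above.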
| __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), bad_memory); |
| __ st1(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B(), bad_memory); |
| __ st1(v27.V16B(), v28.V16B(), v29.V16B(), v30.V16B(), bad_memory); |
| __ st1(v16.V16B(), v17.V16B(), v18.V16B(), bad_memory); |
| __ st1(v21.V16B(), v22.V16B(), v23.V16B(), bad_memory); |
| __ st1(v9.V16B(), v10.V16B(), v11.V16B(), bad_memory); |
| __ st1(v7.V16B(), v8.V16B(), bad_memory); |
| __ st1(v26.V16B(), v27.V16B(), bad_memory); |
| __ st1(v22.V16B(), v23.V16B(), bad_memory); |
| __ st1(v23.V16B(), bad_memory); |
| __ st1(v28.V16B(), bad_memory); |
| __ st1(v2.V16B(), bad_memory); |
| __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), bad_memory); |
| __ st1(v12.V1D(), v13.V1D(), v14.V1D(), v15.V1D(), bad_memory); |
| __ st1(v30.V1D(), v31.V1D(), v0.V1D(), v1.V1D(), bad_memory); |
| __ st1(v16.V1D(), v17.V1D(), v18.V1D(), bad_memory); |
| __ st1(v3.V1D(), v4.V1D(), v5.V1D(), bad_memory); |
| __ st1(v14.V1D(), v15.V1D(), v16.V1D(), bad_memory); |
| __ st1(v18.V1D(), v19.V1D(), bad_memory); |
| __ st1(v5.V1D(), v6.V1D(), bad_memory); |
| __ st1(v2.V1D(), v3.V1D(), bad_memory); |
| __ st1(v4.V1D(), bad_memory); |
| __ st1(v27.V1D(), bad_memory); |
| __ st1(v23.V1D(), bad_memory); |
| __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), bad_memory); |
| __ st1(v22.V2D(), v23.V2D(), v24.V2D(), v25.V2D(), bad_memory); |
| __ st1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory); |
| __ st1(v17.V2D(), v18.V2D(), v19.V2D(), bad_memory); |
| __ st1(v16.V2D(), v17.V2D(), v18.V2D(), bad_memory); |
| __ st1(v22.V2D(), v23.V2D(), v24.V2D(), bad_memory); |
| __ st1(v21.V2D(), v22.V2D(), bad_memory); |
| __ st1(v6.V2D(), v7.V2D(), bad_memory); |
| __ st1(v27.V2D(), v28.V2D(), bad_memory); |
| __ st1(v21.V2D(), bad_memory); |
| __ st1(v29.V2D(), bad_memory); |
| __ st1(v20.V2D(), bad_memory); |
| __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), bad_memory); |
| __ st1(v8.V2S(), v9.V2S(), v10.V2S(), v11.V2S(), bad_memory); |
| __ st1(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), bad_memory); |
| __ st1(v2.V2S(), v3.V2S(), v4.V2S(), bad_memory); |
| __ st1(v23.V2S(), v24.V2S(), v25.V2S(), bad_memory); |
| __ st1(v7.V2S(), v8.V2S(), v9.V2S(), bad_memory); |
| __ st1(v28.V2S(), v29.V2S(), bad_memory); |
| __ st1(v29.V2S(), v30.V2S(), bad_memory); |
| __ st1(v23.V2S(), v24.V2S(), bad_memory); |
| __ st1(v6.V2S(), bad_memory); |
| __ st1(v11.V2S(), bad_memory); |
| __ st1(v17.V2S(), bad_memory); |
| __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), bad_memory); |
| __ st1(v9.V4H(), v10.V4H(), v11.V4H(), v12.V4H(), bad_memory); |
| __ st1(v25.V4H(), v26.V4H(), v27.V4H(), v28.V4H(), bad_memory); |
| __ st1(v11.V4H(), v12.V4H(), v13.V4H(), bad_memory); |
| __ st1(v10.V4H(), v11.V4H(), v12.V4H(), bad_memory); |
| __ st1(v12.V4H(), v13.V4H(), v14.V4H(), bad_memory); |
| __ st1(v13.V4H(), v14.V4H(), bad_memory); |
| __ st1(v15.V4H(), v16.V4H(), bad_memory); |
| __ st1(v21.V4H(), v22.V4H(), bad_memory); |
| __ st1(v16.V4H(), bad_memory); |
| __ st1(v8.V4H(), bad_memory); |
| __ st1(v30.V4H(), bad_memory); |
| __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), bad_memory); |
| __ st1(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), bad_memory); |
| __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), bad_memory); |
| __ st1(v31.V4S(), v0.V4S(), v1.V4S(), bad_memory); |
| __ st1(v30.V4S(), v31.V4S(), v0.V4S(), bad_memory); |
| __ st1(v6.V4S(), v7.V4S(), v8.V4S(), bad_memory); |
| __ st1(v17.V4S(), v18.V4S(), bad_memory); |
| __ st1(v31.V4S(), v0.V4S(), bad_memory); |
| __ st1(v1.V4S(), v2.V4S(), bad_memory); |
| __ st1(v26.V4S(), bad_memory); |
| __ st1(v15.V4S(), bad_memory); |
| __ st1(v13.V4S(), bad_memory); |
| __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), bad_memory); |
| __ st1(v10.V8B(), v11.V8B(), v12.V8B(), v13.V8B(), bad_memory); |
| __ st1(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), bad_memory); |
| __ st1(v19.V8B(), v20.V8B(), v21.V8B(), bad_memory); |
| __ st1(v31.V8B(), v0.V8B(), v1.V8B(), bad_memory); |
| __ st1(v9.V8B(), v10.V8B(), v11.V8B(), bad_memory); |
| __ st1(v12.V8B(), v13.V8B(), bad_memory); |
| __ st1(v2.V8B(), v3.V8B(), bad_memory); |
| __ st1(v0.V8B(), v1.V8B(), bad_memory); |
| __ st1(v16.V8B(), bad_memory); |
| __ st1(v25.V8B(), bad_memory); |
| __ st1(v31.V8B(), bad_memory); |
| __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), bad_memory); |
| __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), bad_memory); |
| __ st1(v26.V8H(), v27.V8H(), v28.V8H(), v29.V8H(), bad_memory); |
| __ st1(v10.V8H(), v11.V8H(), v12.V8H(), bad_memory); |
| __ st1(v21.V8H(), v22.V8H(), v23.V8H(), bad_memory); |
| __ st1(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory); |
| __ st1(v26.V8H(), v27.V8H(), bad_memory); |
| __ st1(v24.V8H(), v25.V8H(), bad_memory); |
| __ st1(v17.V8H(), v18.V8H(), bad_memory); |
| __ st1(v29.V8H(), bad_memory); |
| __ st1(v19.V8H(), bad_memory); |
| __ st1(v23.V8H(), bad_memory); |
| __ st1(v19.B(), 15, bad_memory); |
| __ st1(v25.B(), 9, bad_memory); |
| __ st1(v4.B(), 8, bad_memory); |
| __ st1(v13.D(), 0, bad_memory); |
| __ st1(v30.D(), 0, bad_memory); |
| __ st1(v3.D(), 0, bad_memory); |
| __ st1(v22.H(), 0, bad_memory); |
| __ st1(v31.H(), 7, bad_memory); |
| __ st1(v23.H(), 3, bad_memory); |
| __ st1(v0.S(), 0, bad_memory); |
| __ st1(v11.S(), 3, bad_memory); |
| __ st1(v24.S(), 3, bad_memory); |
| __ st2(v7.V16B(), v8.V16B(), bad_memory); |
| __ st2(v5.V16B(), v6.V16B(), bad_memory); |
| __ st2(v18.V16B(), v19.V16B(), bad_memory); |
| __ st2(v14.V2D(), v15.V2D(), bad_memory); |
| __ st2(v7.V2D(), v8.V2D(), bad_memory); |
| __ st2(v24.V2D(), v25.V2D(), bad_memory); |
| __ st2(v22.V2S(), v23.V2S(), bad_memory); |
| __ st2(v4.V2S(), v5.V2S(), bad_memory); |
| __ st2(v2.V2S(), v3.V2S(), bad_memory); |
| __ st2(v23.V4H(), v24.V4H(), bad_memory); |
| __ st2(v8.V4H(), v9.V4H(), bad_memory); |
| __ st2(v7.V4H(), v8.V4H(), bad_memory); |
| __ st2(v17.V4S(), v18.V4S(), bad_memory); |
| __ st2(v6.V4S(), v7.V4S(), bad_memory); |
| __ st2(v26.V4S(), v27.V4S(), bad_memory); |
| __ st2(v31.V8B(), v0.V8B(), bad_memory); |
| __ st2(v0.V8B(), v1.V8B(), bad_memory); |
| __ st2(v21.V8B(), v22.V8B(), bad_memory); |
| __ st2(v7.V8H(), v8.V8H(), bad_memory); |
| __ st2(v22.V8H(), v23.V8H(), bad_memory); |
| __ st2(v4.V8H(), v5.V8H(), bad_memory); |
| __ st2(v8.B(), v9.B(), 15, bad_memory); |
| __ st2(v8.B(), v9.B(), 15, bad_memory); |
| __ st2(v7.B(), v8.B(), 4, bad_memory); |
| __ st2(v25.D(), v26.D(), 0, bad_memory); |
| __ st2(v17.D(), v18.D(), 1, bad_memory); |
| __ st2(v3.D(), v4.D(), 1, bad_memory); |
| __ st2(v4.H(), v5.H(), 3, bad_memory); |
| __ st2(v0.H(), v1.H(), 5, bad_memory); |
| __ st2(v22.H(), v23.H(), 2, bad_memory); |
| __ st2(v14.S(), v15.S(), 3, bad_memory); |
| __ st2(v23.S(), v24.S(), 3, bad_memory); |
| __ st2(v0.S(), v1.S(), 2, bad_memory); |
| __ st3(v26.V16B(), v27.V16B(), v28.V16B(), bad_memory); |
| __ st3(v21.V16B(), v22.V16B(), v23.V16B(), bad_memory); |
| __ st3(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); |
| __ st3(v17.V2D(), v18.V2D(), v19.V2D(), bad_memory); |
| __ st3(v23.V2D(), v24.V2D(), v25.V2D(), bad_memory); |
| __ st3(v10.V2D(), v11.V2D(), v12.V2D(), bad_memory); |
| __ st3(v9.V2S(), v10.V2S(), v11.V2S(), bad_memory); |
| __ st3(v13.V2S(), v14.V2S(), v15.V2S(), bad_memory); |
| __ st3(v22.V2S(), v23.V2S(), v24.V2S(), bad_memory); |
| __ st3(v31.V4H(), v0.V4H(), v1.V4H(), bad_memory); |
| __ st3(v8.V4H(), v9.V4H(), v10.V4H(), bad_memory); |
| __ st3(v19.V4H(), v20.V4H(), v21.V4H(), bad_memory); |
| __ st3(v18.V4S(), v19.V4S(), v20.V4S(), bad_memory); |
| __ st3(v25.V4S(), v26.V4S(), v27.V4S(), bad_memory); |
| __ st3(v16.V4S(), v17.V4S(), v18.V4S(), bad_memory); |
| __ st3(v27.V8B(), v28.V8B(), v29.V8B(), bad_memory); |
| __ st3(v29.V8B(), v30.V8B(), v31.V8B(), bad_memory); |
| __ st3(v30.V8B(), v31.V8B(), v0.V8B(), bad_memory); |
| __ st3(v8.V8H(), v9.V8H(), v10.V8H(), bad_memory); |
| __ st3(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory); |
| __ st3(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory); |
| __ st3(v31.B(), v0.B(), v1.B(), 10, bad_memory); |
| __ st3(v4.B(), v5.B(), v6.B(), 5, bad_memory); |
| __ st3(v5.B(), v6.B(), v7.B(), 1, bad_memory); |
| __ st3(v5.D(), v6.D(), v7.D(), 0, bad_memory); |
| __ st3(v6.D(), v7.D(), v8.D(), 0, bad_memory); |
| __ st3(v0.D(), v1.D(), v2.D(), 0, bad_memory); |
| __ st3(v31.H(), v0.H(), v1.H(), 2, bad_memory); |
| __ st3(v14.H(), v15.H(), v16.H(), 5, bad_memory); |
| __ st3(v21.H(), v22.H(), v23.H(), 6, bad_memory); |
| __ st3(v21.S(), v22.S(), v23.S(), 0, bad_memory); |
| __ st3(v11.S(), v12.S(), v13.S(), 1, bad_memory); |
| __ st3(v15.S(), v16.S(), v17.S(), 0, bad_memory); |
| __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), bad_memory); |
| __ st4(v24.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), bad_memory); |
| __ st4(v15.V16B(), v16.V16B(), v17.V16B(), v18.V16B(), bad_memory); |
| __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), bad_memory); |
| __ st4(v17.V2D(), v18.V2D(), v19.V2D(), v20.V2D(), bad_memory); |
| __ st4(v9.V2D(), v10.V2D(), v11.V2D(), v12.V2D(), bad_memory); |
| __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), bad_memory); |
| __ st4(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), bad_memory); |
| __ st4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory); |
| __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), bad_memory); |
| __ st4(v18.V4H(), v19.V4H(), v20.V4H(), v21.V4H(), bad_memory); |
| __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), bad_memory); |
| __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), bad_memory); |
| __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), bad_memory); |
| __ st4(v15.V4S(), v16.V4S(), v17.V4S(), v18.V4S(), bad_memory); |
| __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), bad_memory); |
| __ st4(v25.V8B(), v26.V8B(), v27.V8B(), v28.V8B(), bad_memory); |
| __ st4(v19.V8B(), v20.V8B(), v21.V8B(), v22.V8B(), bad_memory); |
| __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), bad_memory); |
| __ st4(v15.V8H(), v16.V8H(), v17.V8H(), v18.V8H(), bad_memory); |
| __ st4(v31.V8H(), v0.V8H(), v1.V8H(), v2.V8H(), bad_memory); |
| __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, bad_memory); |
| __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, bad_memory); |
| __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, bad_memory); |
| __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, bad_memory); |
| __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, bad_memory); |
| __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, bad_memory); |
| __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, bad_memory); |
| __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, bad_memory); |
| __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, bad_memory); |
| __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, bad_memory); |
| __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, bad_memory); |
| __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, bad_memory); |
| |
| END_IMPLICIT_CHECK(); |
| TRY_RUN_IMPLICIT_CHECK(); |
| } |
| |
| TEST(ImplicitCheckSve) { |
| SETUP_WITH_FEATURES(CPUFeatures::kSVE, |
| CPUFeatures::kSVE2, |
| CPUFeatures::kNEON); |
| START_IMPLICIT_CHECK(); |
| |
| SVEMemOperand bad_sve_memory = SVEMemOperand(ip0); |
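  // As with `bad_memory`, ip0 is expected to hold an inaccessible address
  // here, so each SVE access below should fault and exercise the implicit
  // check.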
| |
| EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); |
| // Simple, unpredicated loads and stores. |
| __ Str(p12.VnD(), bad_sve_memory); |
| __ Str(p13.VnS(), bad_sve_memory); |
| __ Str(p14.VnH(), bad_sve_memory); |
| __ Str(p15.VnB(), bad_sve_memory); |
| __ Ldr(p8.VnD(), bad_sve_memory); |
| __ Ldr(p9.VnS(), bad_sve_memory); |
| __ Ldr(p10.VnH(), bad_sve_memory); |
| __ Ldr(p11.VnB(), bad_sve_memory); |
| |
| __ Str(z0.VnD(), bad_sve_memory); |
| __ Str(z1.VnS(), bad_sve_memory); |
| __ Str(z2.VnH(), bad_sve_memory); |
| __ Str(z3.VnB(), bad_sve_memory); |
| __ Ldr(z20.VnD(), bad_sve_memory); |
| __ Ldr(z21.VnS(), bad_sve_memory); |
| __ Ldr(z22.VnH(), bad_sve_memory); |
| __ Ldr(z23.VnB(), bad_sve_memory); |
| |
| // Structured accesses. |
| __ St1b(z0.VnB(), p2, bad_sve_memory); |
| __ St1h(z1.VnH(), p1, bad_sve_memory); |
| __ St1w(z2.VnS(), p1, bad_sve_memory); |
| __ St1d(z3.VnD(), p2, bad_sve_memory); |
| __ Ld1b(z20.VnB(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1h(z21.VnH(), p2.Zeroing(), bad_sve_memory); |
| __ Ld1w(z22.VnS(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1d(z23.VnD(), p1.Zeroing(), bad_sve_memory); |
| |
| // Structured, packed accesses. |
| __ St1b(z2.VnH(), p1, bad_sve_memory); |
| __ St1b(z3.VnS(), p2, bad_sve_memory); |
| __ St1b(z4.VnD(), p2, bad_sve_memory); |
| __ St1h(z0.VnS(), p1, bad_sve_memory); |
| __ St1h(z1.VnD(), p1, bad_sve_memory); |
| __ St1w(z2.VnD(), p1, bad_sve_memory); |
| __ Ld1b(z20.VnH(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1b(z21.VnS(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1b(z22.VnD(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1h(z23.VnS(), p2.Zeroing(), bad_sve_memory); |
| __ Ld1h(z24.VnD(), p2.Zeroing(), bad_sve_memory); |
| __ Ld1w(z20.VnD(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1sb(z21.VnH(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1sb(z22.VnS(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1sb(z23.VnD(), p2.Zeroing(), bad_sve_memory); |
| __ Ld1sh(z24.VnS(), p2.Zeroing(), bad_sve_memory); |
| __ Ld1sh(z20.VnD(), p1.Zeroing(), bad_sve_memory); |
| __ Ld1sw(z21.VnD(), p1.Zeroing(), bad_sve_memory); |
| |
| // Structured, interleaved accesses. |
| __ St2b(z0.VnB(), z1.VnB(), p4, bad_sve_memory); |
| __ St2h(z1.VnH(), z2.VnH(), p4, bad_sve_memory); |
| __ St2w(z2.VnS(), z3.VnS(), p3, bad_sve_memory); |
| __ St2d(z3.VnD(), z4.VnD(), p4, bad_sve_memory); |
| __ Ld2b(z20.VnB(), z21.VnB(), p5.Zeroing(), bad_sve_memory); |
| __ Ld2h(z21.VnH(), z22.VnH(), p6.Zeroing(), bad_sve_memory); |
| __ Ld2w(z22.VnS(), z23.VnS(), p6.Zeroing(), bad_sve_memory); |
| __ Ld2d(z23.VnD(), z24.VnD(), p5.Zeroing(), bad_sve_memory); |
| |
| __ St3b(z4.VnB(), z5.VnB(), z6.VnB(), p4, bad_sve_memory); |
| __ St3h(z5.VnH(), z6.VnH(), z7.VnH(), p4, bad_sve_memory); |
| __ St3w(z6.VnS(), z7.VnS(), z8.VnS(), p3, bad_sve_memory); |
| __ St3d(z7.VnD(), z8.VnD(), z9.VnD(), p4, bad_sve_memory); |
| __ Ld3b(z24.VnB(), z25.VnB(), z26.VnB(), p5.Zeroing(), bad_sve_memory); |
| __ Ld3h(z25.VnH(), z26.VnH(), z27.VnH(), p6.Zeroing(), bad_sve_memory); |
| __ Ld3w(z26.VnS(), z27.VnS(), z28.VnS(), p6.Zeroing(), bad_sve_memory); |
| __ Ld3d(z27.VnD(), z28.VnD(), z29.VnD(), p5.Zeroing(), bad_sve_memory); |
| |
| __ St4b(z31.VnB(), z0.VnB(), z1.VnB(), z2.VnB(), p4, bad_sve_memory); |
| __ St4h(z0.VnH(), z1.VnH(), z2.VnH(), z3.VnH(), p4, bad_sve_memory); |
| __ St4w(z1.VnS(), z2.VnS(), z3.VnS(), z4.VnS(), p3, bad_sve_memory); |
| __ St4d(z2.VnD(), z3.VnD(), z4.VnD(), z5.VnD(), p4, bad_sve_memory); |
| __ Ld4b(z25.VnB(), |
| z26.VnB(), |
| z27.VnB(), |
| z28.VnB(), |
| p5.Zeroing(), |
| bad_sve_memory); |
| __ Ld4h(z26.VnH(), |
| z27.VnH(), |
| z28.VnH(), |
| z29.VnH(), |
| p6.Zeroing(), |
| bad_sve_memory); |
| __ Ld4w(z27.VnS(), |
| z28.VnS(), |
| z29.VnS(), |
| z30.VnS(), |
| p6.Zeroing(), |
| bad_sve_memory); |
| __ Ld4d(z28.VnD(), |
| z29.VnD(), |
| z30.VnD(), |
| z31.VnD(), |
| p5.Zeroing(), |
| bad_sve_memory); |
| |
| END_IMPLICIT_CHECK(); |
| TRY_RUN_IMPLICIT_CHECK(); |
| } |
| |
| TEST(ImplicitCheckAtomics) { |
| SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kAtomics); |
| START_IMPLICIT_CHECK(); |
| |
| EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); |
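// Emit every size (byte, halfword, word and doubleword) and memory-ordering
// (plain, acquire, release and acquire-release) variant of the given atomic
// operation. Note that the store forms have no acquire variants.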
| #define INST_LIST(OP) \ |
| __ Ld##OP##b(w0, w0, bad_memory); \ |
| __ Ld##OP##ab(w0, w1, bad_memory); \ |
| __ Ld##OP##lb(w0, w2, bad_memory); \ |
| __ Ld##OP##alb(w0, w3, bad_memory); \ |
| __ Ld##OP##h(w0, w0, bad_memory); \ |
| __ Ld##OP##ah(w0, w1, bad_memory); \ |
| __ Ld##OP##lh(w0, w2, bad_memory); \ |
| __ Ld##OP##alh(w0, w3, bad_memory); \ |
| __ Ld##OP(w0, w0, bad_memory); \ |
| __ Ld##OP##a(w0, w1, bad_memory); \ |
| __ Ld##OP##l(w0, w2, bad_memory); \ |
| __ Ld##OP##al(w0, w3, bad_memory); \ |
| __ Ld##OP(x0, x0, bad_memory); \ |
| __ Ld##OP##a(x0, x1, bad_memory); \ |
| __ Ld##OP##l(x0, x2, bad_memory); \ |
| __ Ld##OP##al(x0, x3, bad_memory); \ |
| __ St##OP##b(w0, bad_memory); \ |
| __ St##OP##lb(w0, bad_memory); \ |
| __ St##OP##h(w0, bad_memory); \ |
| __ St##OP##lh(w0, bad_memory); \ |
| __ St##OP(w0, bad_memory); \ |
| __ St##OP##l(w0, bad_memory); \ |
| __ St##OP(x0, bad_memory); \ |
| __ St##OP##l(x0, bad_memory); |
| |
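  // Each expansion corresponds to one Armv8.1 (FEAT_LSE) atomic memory
  // operation: LDADD, LDSET, LDEOR, LDSMIN, LDSMAX, LDUMIN, LDUMAX and LDCLR,
  // along with their ST* aliases.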
| INST_LIST(add); |
| INST_LIST(set); |
| INST_LIST(eor); |
| INST_LIST(smin); |
| INST_LIST(smax); |
| INST_LIST(umin); |
| INST_LIST(umax); |
| INST_LIST(clr); |
| |
| #undef INST_LIST |
| |
| END_IMPLICIT_CHECK(); |
| TRY_RUN_IMPLICIT_CHECK(); |
| } |
| |
| TEST(ImplicitCheckMops) { |
| SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kMOPS); |
| START_IMPLICIT_CHECK(); |
| |
| EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); |
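  // Memory-set (SET*) forms: plain, non-temporal and tag-setting variants.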
| __ Set(x15, ip1, ip0); |
| __ Setn(x15, ip1, ip0); |
| __ Setg(x15, ip1, ip0); |
| __ Setgn(x15, ip1, ip0); |
| |
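  // Memory-copy (CPY*) forms; the Cpyf variants may only copy forwards.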
| __ Cpy(x15, ip0, ip1); |
| __ Cpyn(x15, ip0, ip1); |
| __ Cpyrn(x15, ip0, ip1); |
| __ Cpywn(x15, ip0, ip1); |
| __ Cpyf(x15, ip0, ip1); |
| __ Cpyfn(x15, ip0, ip1); |
| __ Cpyfrn(x15, ip0, ip1); |
| __ Cpyfwn(x15, ip0, ip1); |
| |
  // The macro-assembler expands each MOPS instruction into prologue, main and
  // epilogue instructions, of which only the main instruction will fail.
  // Adjust the counter to account for those additional instructions, as well
  // as for the instructions that follow.
| __ Mov(x0, 3); |
| __ Mul(x1, x1, x0); |
| __ Add(x1, x1, x0); |
| |
| END_IMPLICIT_CHECK(); |
| TRY_RUN_IMPLICIT_CHECK(); |
| } |
| #endif // VIXL_ENABLE_IMPLICIT_CHECKS |
| |
| #undef __ |
| #define __ masm-> |
| |
| #if defined(VIXL_INCLUDE_SIMULATOR_AARCH64) && \ |
| defined(VIXL_HAS_ABI_SUPPORT) && __cplusplus >= 201103L && \ |
| (defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1)) |
| |
| // Generate a function that stores zero to a hard-coded address. |
| Instruction* GenerateStoreZero(MacroAssembler* masm, int32_t* target) { |
| masm->Reset(); |
| |
| UseScratchRegisterScope temps(masm); |
| Register temp = temps.AcquireX(); |
| __ Mov(temp, reinterpret_cast<intptr_t>(target)); |
| __ Str(wzr, MemOperand(temp)); |
| __ Ret(); |
| |
| masm->FinalizeCode(); |
| return masm->GetBuffer()->GetStartAddress<Instruction*>(); |
| } |
| |
| |
| // Generate a function that stores the `int32_t` argument to a hard-coded |
| // address. |
// In this example and the others below, we use the `abi` object to retrieve
// argument and return locations, even though we could easily hard-code them.
// This mirrors how more generic (e.g. templated) user code would use these
// mechanisms.
| Instruction* GenerateStoreInput(MacroAssembler* masm, int32_t* target) { |
| masm->Reset(); |
| |
| ABI abi; |
| Register input = |
| Register(abi.GetNextParameterGenericOperand<int32_t>().GetCPURegister()); |
| |
| UseScratchRegisterScope temps(masm); |
| Register temp = temps.AcquireX(); |
| __ Mov(temp, reinterpret_cast<intptr_t>(target)); |
| __ Str(input, MemOperand(temp)); |
| __ Ret(); |
| |
| masm->FinalizeCode(); |
| return masm->GetBuffer()->GetStartAddress<Instruction*>(); |
| } |
| |
| |
// A minimal implementation of a `pow` function: compute `input ^ pow` by
// repeated multiplication.
| Instruction* GeneratePow(MacroAssembler* masm, unsigned pow) { |
| masm->Reset(); |
| |
| ABI abi; |
| Register input = |
| Register(abi.GetNextParameterGenericOperand<int64_t>().GetCPURegister()); |
| Register result = |
| Register(abi.GetReturnGenericOperand<int64_t>().GetCPURegister()); |
| UseScratchRegisterScope temps(masm); |
| Register temp = temps.AcquireX(); |
| |
| __ Mov(temp, 1); |
| for (unsigned i = 0; i < pow; i++) { |
| __ Mul(temp, temp, input); |
| } |
| __ Mov(result, temp); |
| __ Ret(); |
| |
| masm->FinalizeCode(); |
| return masm->GetBuffer()->GetStartAddress<Instruction*>(); |
| } |
| |
| |
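// Generate a function that returns the sum of its `float`, `int64_t` and
// `double` arguments as a `double`.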
| Instruction* GenerateSum(MacroAssembler* masm) { |
| masm->Reset(); |
| |
| ABI abi; |
| VRegister input_1 = |
| VRegister(abi.GetNextParameterGenericOperand<float>().GetCPURegister()); |
| Register input_2 = |
| Register(abi.GetNextParameterGenericOperand<int64_t>().GetCPURegister()); |
| VRegister input_3 = |
| VRegister(abi.GetNextParameterGenericOperand<double>().GetCPURegister()); |
| VRegister result = |
| VRegister(abi.GetReturnGenericOperand<double>().GetCPURegister()); |
| |
| UseScratchRegisterScope temps(masm); |
| VRegister temp = temps.AcquireD(); |
| |
| __ Fcvt(input_1.D(), input_1); |
| __ Scvtf(temp, input_2); |
| __ Fadd(temp, temp, input_1.D()); |
| __ Fadd(result, temp, input_3); |
| __ Ret(); |
| |
| masm->FinalizeCode(); |
| return masm->GetBuffer()->GetStartAddress<Instruction*>(); |
| } |
| |
| |
| TEST(RunFrom) { |
| SETUP_WITH_FEATURES(CPUFeatures::kFP); |
| |
  // Run a function returning `void` and taking no arguments.
| int32_t value = 0xbad; |
| simulator.RunFrom(GenerateStoreZero(&masm, &value)); |
| VIXL_CHECK(value == 0); |
| |
| // Run a function returning `void` and taking one argument. |
| int32_t argument = 0xf00d; |
| simulator.RunFrom<void, int32_t>(GenerateStoreInput(&masm, &value), argument); |
| VIXL_CHECK(value == 0xf00d); |
| |
| // Run a function taking one argument and returning a value. |
| int64_t res_int64_t; |
| res_int64_t = |
| simulator.RunFrom<int64_t, int64_t>(GeneratePow(&masm, 0), 0xbad); |
| VIXL_CHECK(res_int64_t == 1); |
| res_int64_t = simulator.RunFrom<int64_t, int64_t>(GeneratePow(&masm, 1), 123); |
| VIXL_CHECK(res_int64_t == 123); |
| res_int64_t = simulator.RunFrom<int64_t, int64_t>(GeneratePow(&masm, 10), 2); |
| VIXL_CHECK(res_int64_t == 1024); |
| |
| // Run a function taking multiple arguments in registers. |
| double res_double = |
| simulator.RunFrom<double, float, int64_t, double>(GenerateSum(&masm), |
| 1.0, |
| 2, |
| 3.0); |
| VIXL_CHECK(res_double == 6.0); |
| } |
| |
| #endif |
| |
| |
| } // namespace aarch64 |
| } // namespace vixl |