/*
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
Register tmp2Reg, Register tmp3Reg) {
Register oop = objectReg;
Register box = boxReg;
Register disp_hdr = tmpReg;
Register tmp = tmp2Reg;
Label cont;
Label object_has_monitor;
Label count, no_count;
assert_different_registers(oop, box, tmp, disp_hdr);
// Load markWord from object into displaced_header.
ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
if (DiagnoseSyncOnValueBasedClasses != 0) {
load_klass(tmp, oop);
ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
br(Assembler::NE, cont);
}
// Check for existing monitor
tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
if (LockingMode == LM_MONITOR) {
tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
b(cont);
} else if (LockingMode == LM_LEGACY) {
// Set tmp to be (markWord of object | UNLOCK_VALUE).
orr(tmp, disp_hdr, markWord::unlocked_value);
// Initialize the box. (Must happen before we update the object mark!)
str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
// Compare object markWord with an unlocked value (tmp) and if
// equal exchange the stack address of our box with object markWord.
// On failure disp_hdr contains the possibly locked markWord.
cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
/*release*/ true, /*weak*/ false, disp_hdr);
br(Assembler::EQ, cont);
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
// If the compare-and-exchange succeeded, then we found an unlocked
// object, have now locked it, and will continue at label cont.
// Check if the owner is self by comparing the value in the
// markWord of object (disp_hdr) with the stack pointer.
mov(rscratch1, sp);
sub(disp_hdr, disp_hdr, rscratch1);
mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
// If the condition is true we are done (continue at cont) and hence we can store 0
// as the displaced header in the box, which indicates that it is a recursive lock.
ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
b(cont);
} else {
assert(LockingMode == LM_LIGHTWEIGHT, "must be");
lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count);
b(count);
}
// Handle existing monitor.
bind(object_has_monitor);
// The object's monitor m is unlocked iff m->owner == NULL,
// otherwise m->owner may contain a thread or a stack address.
//
// Try to CAS m->owner from NULL to current thread.
add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
/*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result
if (LockingMode != LM_LIGHTWEIGHT) {
// Store a non-null value into the box to avoid looking like a re-entrant
// lock. The fast-path monitor unlock code checks for
// markWord::monitor_value so use markWord::unused_mark which has the
// relevant bit set, and also matches ObjectSynchronizer::enter.
mov(tmp, (address)markWord::unused_mark().value());
str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
}
br(Assembler::EQ, cont); // CAS success means locking succeeded
cmp(rscratch1, rthread);
br(Assembler::NE, cont); // Check for recursive locking
// Recursive lock case
increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
// flag == EQ still from the cmp above, checking if this is a reentrant lock
bind(cont);
// flag == EQ indicates success
// flag == NE indicates failure
br(Assembler::NE, no_count);
bind(count);
increment(Address(rthread, JavaThread::held_monitor_count_offset()));
bind(no_count);
}
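// A rough C-like sketch of the fast-lock path emitted above (illustrative only;
// the names below are descriptive, not actual HotSpot identifiers):
//
//   markWord mark = obj->mark();
//   if (mark has the monitor bit set) {              // inflated lock
//     succeed if CAS(&monitor->owner, NULL, current_thread) succeeds,
//     or if we already own it (then monitor->recursions++); otherwise fail.
//   } else if (LockingMode == LM_LEGACY) {
//     box->displaced_header = mark | unlocked_value;
//     if (CAS(&obj->mark, mark | unlocked_value, box)) succeed;
//     else if (the old mark points into the current thread's stack) {
//       box->displaced_header = 0;                   // recursive stack-lock
//       succeed;
//     } else fail;
//   }
//   // On success the flags are EQ and the held-monitor count is incremented;
//   // on failure the flags are NE and the slow path is taken.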
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
Register tmp2Reg) {
Register oop = objectReg;
Register box = boxReg;
Register disp_hdr = tmpReg;
Register tmp = tmp2Reg;
Label cont;
Label object_has_monitor;
Label count, no_count;
assert_different_registers(oop, box, tmp, disp_hdr);
if (LockingMode == LM_LEGACY) {
// Find the lock address and load the displaced header from the stack.
ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
// If the displaced header is 0, we have a recursive unlock.
cmp(disp_hdr, zr);
br(Assembler::EQ, cont);
}
// Handle existing monitor.
ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
if (LockingMode == LM_MONITOR) {
tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
b(cont);
} else if (LockingMode == LM_LEGACY) {
// Check if it is still a lightweight lock; this is true if we
// see the stack address of the basicLock in the markWord of the
// object.
cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
/*release*/ true, /*weak*/ false, tmp);
b(cont);
} else {
assert(LockingMode == LM_LIGHTWEIGHT, "must be");
lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
b(count);
}
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
// Handle existing monitor.
bind(object_has_monitor);
STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
add(tmp, tmp, -(int)markWord::monitor_value); // monitor
if (LockingMode == LM_LIGHTWEIGHT) {
// If the owner is anonymous, we need to fix it -- in an outline stub.
Register tmp2 = disp_hdr;
ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
// We cannot use tbnz here, the target might be too far away and cannot
// be encoded.
tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
Compile::current()->output()->add_stub(stub);
br(Assembler::NE, stub->entry());
bind(stub->continuation());
}
ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
Label notRecursive;
cbz(disp_hdr, notRecursive);
// Recursive lock
sub(disp_hdr, disp_hdr, 1u);
str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
cmp(disp_hdr, disp_hdr); // Sets flags for result
b(cont);
bind(notRecursive);
ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
cmp(rscratch1, zr); // Sets flags for result
cbnz(rscratch1, cont);
// need a release store here
lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
stlr(zr, tmp); // set unowned
bind(cont);
// flag == EQ indicates success
// flag == NE indicates failure
br(Assembler::NE, no_count);
bind(count);
decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
bind(no_count);
}
// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
Register cnt2, Register cnt1,
Register tmp1, Register tmp2,
Register tmp3, Register tmp4,
Register tmp5, Register tmp6,
int icnt1, Register result, int ae) {
// NOTE: tmp5, tmp6 can be zr depending on specific method version
Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
Register ch1 = rscratch1;
Register ch2 = rscratch2;
Register cnt1tmp = tmp1;
Register cnt2tmp = tmp2;
Register cnt1_neg = cnt1;
Register cnt2_neg = cnt2;
Register result_tmp = tmp4;
bool isL = ae == StrIntrinsicNode::LL;
bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
int str1_chr_shift = str1_isL ? 0:1;
int str2_chr_shift = str2_isL ? 0:1;
int str1_chr_size = str1_isL ? 1:2;
int str2_chr_size = str2_isL ? 1:2;
chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
// Note, inline_string_indexOf() generates checks:
// if (substr.count > string.count) return -1;
// if (substr.count == 0) return 0;
// We have two strings, a source string in str2, cnt2 and a pattern string
// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
// For a larger pattern and source we use a simplified Boyer-Moore algorithm.
// With a small pattern and source we use a linear scan.
if (icnt1 == -1) {
sub(result_tmp, cnt2, cnt1);
cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
br(LT, LINEARSEARCH);
dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
subs(zr, cnt1, 256);
lsr(tmp1, cnt2, 2);
ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
br(GE, LINEARSTUB);
}
// The Boyer-Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
// int bm(unsigned char *x, int m, unsigned char *y, int n) {
// int i, j;
// unsigned c;
// unsigned char bc[ASIZE];
//
// /* Preprocessing */
// for (i = 0; i < ASIZE; ++i)
// bc[i] = m;
// for (i = 0; i < m - 1; ) {
// c = x[i];
// ++i;
// // c < 256 for Latin1 string, so, no need for branch
// #ifdef PATTERN_STRING_IS_LATIN1
// bc[c] = m - i;
// #else
// if (c < ASIZE) bc[c] = m - i;
// #endif
// }
//
// /* Searching */
// j = 0;
// while (j <= n - m) {
// c = y[j+m-1];
// if (x[m-1] == c)
// for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
// if (i < 0) return j;
// // c < 256 for Latin1 string, so, no need for branch
// #ifdef SOURCE_STRING_IS_LATIN1
// // LL case: (c< 256) always true. Remove branch
// j += bc[y[j+m-1]];
// #endif
// #ifndef PATTERN_STRING_IS_UTF
// // UU case: need if (c<ASIZE) check. Skip 1 character if not.
// if (c < ASIZE)
// j += bc[y[j+m-1]];
// else
// j += 1
// #endif
// #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
// // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
// if (c < ASIZE)
// j += bc[y[j+m-1]];
// else
// j += m
// #endif
// }
// }
if (icnt1 == -1) {
Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
Register cnt1end = tmp2;
Register str2end = cnt2;
Register skipch = tmp2;
// str1 length is >= 8, so we can read at least 1 register for cases when
// UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a register
// for the UL case. We'll re-read the last character in the inner pre-loop code to
// keep a single outer pre-loop load.
const int firstStep = isL ? 7 : 3;
const int ASIZE = 256;
const int STORED_BYTES = 32; // amount of bytes stored per instruction
sub(sp, sp, ASIZE);
mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
mov(ch1, sp);
BIND(BM_INIT_LOOP);
stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
subs(tmp5, tmp5, 1);
br(GT, BM_INIT_LOOP);
sub(cnt1tmp, cnt1, 1);
mov(tmp5, str2);
add(str2end, str2, result_tmp, LSL, str2_chr_shift);
sub(ch2, cnt1, 1);
mov(tmp3, str1);
BIND(BCLOOP);
(this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
if (!str1_isL) {
subs(zr, ch1, ASIZE);
br(HS, BCSKIP);
}
strb(ch2, Address(sp, ch1));
BIND(BCSKIP);
subs(ch2, ch2, 1);
br(GT, BCLOOP);
add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
if (str1_isL == str2_isL) {
// load last 8 bytes (8LL/4UU symbols)
ldr(tmp6, Address(tmp6, -wordSize));
} else {
ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
// convert Latin1 to UTF. We'll have to wait until load completed, but
// it's still faster than per-character loads+checks
lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
ubfx(ch1, tmp6, 8, 8); // str1[N-2]
ubfx(ch2, tmp6, 16, 8); // str1[N-3]
andr(tmp6, tmp6, 0xFF); // str1[N-4]
orr(ch2, ch1, ch2, LSL, 16);
orr(tmp6, tmp6, tmp3, LSL, 48);
orr(tmp6, tmp6, ch2, LSL, 16);
}
BIND(BMLOOPSTR2);
(this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
if (str1_isL == str2_isL) {
// re-init tmp3. It's for free because it's executed in parallel with
// load above. Alternative is to initialize it before loop, but it'll
// affect performance on in-order systems with 2 or more ld/st pipelines
lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
}
if (!isL) { // UU/UL case
lsl(ch2, cnt1tmp, 1); // offset in bytes
}
cmp(tmp3, skipch);
br(NE, BMSKIP);
ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
mov(ch1, tmp6);
if (isL) {
b(BMLOOPSTR1_AFTER_LOAD);
} else {
sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
b(BMLOOPSTR1_CMP);
}
BIND(BMLOOPSTR1);
(this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
(this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
BIND(BMLOOPSTR1_AFTER_LOAD);
subs(cnt1tmp, cnt1tmp, 1);
br(LT, BMLOOPSTR1_LASTCMP);
BIND(BMLOOPSTR1_CMP);
cmp(ch1, ch2);
br(EQ, BMLOOPSTR1);
BIND(BMSKIP);
if (!isL) {
// if we've met a UTF symbol while searching with a Latin1 pattern, then we can
// skip cnt1 symbols
if (str1_isL != str2_isL) {
mov(result_tmp, cnt1);
} else {
mov(result_tmp, 1);
}
subs(zr, skipch, ASIZE);
br(HS, BMADV);
}
ldrb(result_tmp, Address(sp, skipch)); // load skip distance
BIND(BMADV);
sub(cnt1tmp, cnt1, 1);
add(str2, str2, result_tmp, LSL, str2_chr_shift);
cmp(str2, str2end);
br(LE, BMLOOPSTR2);
add(sp, sp, ASIZE);
b(NOMATCH);
BIND(BMLOOPSTR1_LASTCMP);
cmp(ch1, ch2);
br(NE, BMSKIP);
BIND(BMMATCH);
sub(result, str2, tmp5);
if (!str2_isL) lsr(result, result, 1);
add(sp, sp, ASIZE);
b(DONE);
BIND(LINEARSTUB);
cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
br(LT, LINEAR_MEDIUM);
mov(result, zr);
RuntimeAddress stub = nullptr;
if (isL) {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
} else if (str1_isL) {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
} else {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
}
address call = trampoline_call(stub);
if (call == nullptr) {
DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
ciEnv::current()->record_failure("CodeCache is full");
return;
}
b(DONE);
}
BIND(LINEARSEARCH);
{
Label DO1, DO2, DO3;
Register str2tmp = tmp2;
Register first = tmp3;
if (icnt1 == -1)
{
Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
br(LT, DOSHORT);
BIND(LINEAR_MEDIUM);
(this->*str1_load_1chr)(first, Address(str1));
lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(FIRST_LOOP);
(this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
cmp(first, ch2);
br(EQ, STR1_LOOP);
BIND(STR2_NEXT);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, FIRST_LOOP);
b(NOMATCH);
BIND(STR1_LOOP);
adds(cnt1tmp, cnt1_neg, str1_chr_size);
add(cnt2tmp, cnt2_neg, str2_chr_size);
br(GE, MATCH);
BIND(STR1_NEXT);
(this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
(this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
cmp(ch1, ch2);
br(NE, STR2_NEXT);
adds(cnt1tmp, cnt1tmp, str1_chr_size);
add(cnt2tmp, cnt2tmp, str2_chr_size);
br(LT, STR1_NEXT);
b(MATCH);
BIND(DOSHORT);
if (str1_isL == str2_isL) {
cmp(cnt1, (u1)2);
br(LT, DO1);
br(GT, DO3);
}
}
if (icnt1 == 4) {
Label CH1_LOOP;
(this->*load_4chr)(ch1, str1);
sub(result_tmp, cnt2, 4);
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(CH1_LOOP);
(this->*load_4chr)(ch2, Address(str2, cnt2_neg));
cmp(ch1, ch2);
br(EQ, MATCH);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, CH1_LOOP);
b(NOMATCH);
}
if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
Label CH1_LOOP;
BIND(DO2);
(this->*load_2chr)(ch1, str1);
if (icnt1 == 2) {
sub(result_tmp, cnt2, 2);
}
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(CH1_LOOP);
(this->*load_2chr)(ch2, Address(str2, cnt2_neg));
cmp(ch1, ch2);
br(EQ, MATCH);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, CH1_LOOP);
b(NOMATCH);
}
if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
BIND(DO3);
(this->*load_2chr)(first, str1);
(this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
if (icnt1 == 3) {
sub(result_tmp, cnt2, 3);
}
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(FIRST_LOOP);
(this->*load_2chr)(ch2, Address(str2, cnt2_neg));
cmpw(first, ch2);
br(EQ, STR1_LOOP);
BIND(STR2_NEXT);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, FIRST_LOOP);
b(NOMATCH);
BIND(STR1_LOOP);
add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
(this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
cmp(ch1, ch2);
br(NE, STR2_NEXT);
b(MATCH);
}
if (icnt1 == -1 || icnt1 == 1) {
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
BIND(DO1);
(this->*str1_load_1chr)(ch1, str1);
cmp(cnt2, (u1)8);
br(LT, DO1_SHORT);
sub(result_tmp, cnt2, 8/str2_chr_size);
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
if (str2_isL) {
orr(ch1, ch1, ch1, LSL, 8);
}
orr(ch1, ch1, ch1, LSL, 16);
orr(ch1, ch1, ch1, LSL, 32);
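// The loop below applies the classic SWAR zero-lane test to ch2 = data ^ pattern:
// (ch2 - 0x01..01) & ~ch2 & 0x80..80 (computed here via sub/orr/bics) is non-zero
// iff some byte (or 16-bit char) lane of ch2 is zero, i.e. iff the first pattern
// character occurs somewhere in this longword.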
BIND(CH1_LOOP);
ldr(ch2, Address(str2, cnt2_neg));
eor(ch2, ch1, ch2);
sub(tmp1, ch2, tmp3);
orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt2_neg, cnt2_neg, 8);
br(LT, CH1_LOOP);
cmp(cnt2_neg, (u1)8);
mov(cnt2_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
rev(tmp1, tmp1);
clz(tmp1, tmp1);
add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
b(MATCH);
BIND(DO1_SHORT);
mov(result_tmp, cnt2);
lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
BIND(DO1_LOOP);
(this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
cmpw(ch1, ch2);
br(EQ, MATCH);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LT, DO1_LOOP);
}
}
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
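// cnt2_neg holds the (negative) byte offset of the match from the adjusted end of
// str2; shift it back to characters and add result_tmp to produce the match index.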
add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
BIND(DONE);
}
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
Register ch, Register result,
Register tmp1, Register tmp2, Register tmp3)
{
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
Register cnt1_neg = cnt1;
Register ch1 = rscratch1;
Register result_tmp = rscratch2;
cbz(cnt1, NOMATCH);
cmp(cnt1, (u1)4);
br(LT, DO1_SHORT);
orr(ch, ch, ch, LSL, 16);
orr(ch, ch, ch, LSL, 32);
sub(cnt1, cnt1, 4);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1, Address::uxtw(1)));
sub(cnt1_neg, zr, cnt1, LSL, 1);
mov(tmp3, 0x0001000100010001);
BIND(CH1_LOOP);
ldr(ch1, Address(str1, cnt1_neg));
eor(ch1, ch, ch1);
sub(tmp1, ch1, tmp3);
orr(tmp2, ch1, 0x7fff7fff7fff7fff);
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt1_neg, cnt1_neg, 8);
br(LT, CH1_LOOP);
cmp(cnt1_neg, (u1)8);
mov(cnt1_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
rev(tmp1, tmp1);
clz(tmp1, tmp1);
add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
b(MATCH);
BIND(DO1_SHORT);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1, Address::uxtw(1)));
sub(cnt1_neg, zr, cnt1, LSL, 1);
BIND(DO1_LOOP);
ldrh(ch1, Address(str1, cnt1_neg));
cmpw(ch, ch1);
br(EQ, MATCH);
adds(cnt1_neg, cnt1_neg, 2);
br(LT, DO1_LOOP);
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
add(result, result_tmp, cnt1_neg, ASR, 1);
BIND(DONE);
}
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
Register ch, Register result,
FloatRegister ztmp1,
FloatRegister ztmp2,
PRegister tmp_pg,
PRegister tmp_pdn, bool isL)
{
// Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
assert(tmp_pg->is_governing(),
"this register has to be a governing predicate register");
Label LOOP, MATCH, DONE, NOMATCH;
Register vec_len = rscratch1;
Register idx = rscratch2;
SIMD_RegVariant T = (isL == true) ? B : H;
cbz(cnt1, NOMATCH);
// Assign the particular char throughout the vector.
sve_dup(ztmp2, T, ch);
if (isL) {
sve_cntb(vec_len);
} else {
sve_cnth(vec_len);
}
mov(idx, 0);
// Generate a predicate to control the reading of input string.
sve_whilelt(tmp_pg, T, idx, cnt1);
BIND(LOOP);
// Read a vector of 8- or 16-bit data depending on the string type. Note
// that inactive elements indicated by the predicate register won't cause
// a data read from memory to the destination vector.
if (isL) {
sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
} else {
sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
}
add(idx, idx, vec_len);
// Perform the comparison. An element of the destination predicate is set
// to active if the particular char is matched.
sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
// Branch if the particular char is found.
br(NE, MATCH);
sve_whilelt(tmp_pg, T, idx, cnt1);
// Loop back if the particular char is not found.
br(MI, LOOP);
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
// Undo the index increment.
sub(idx, idx, vec_len);
// Crop the vector to find its location.
sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
add(result, idx, -1);
sve_incp(result, T, tmp_pdn);
BIND(DONE);
}
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
Register ch, Register result,
Register tmp1, Register tmp2, Register tmp3)
{
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
Register cnt1_neg = cnt1;
Register ch1 = rscratch1;
Register result_tmp = rscratch2;
cbz(cnt1, NOMATCH);
cmp(cnt1, (u1)8);
br(LT, DO1_SHORT);
orr(ch, ch, ch, LSL, 8);
orr(ch, ch, ch, LSL, 16);
orr(ch, ch, ch, LSL, 32);
sub(cnt1, cnt1, 8);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1));
sub(cnt1_neg, zr, cnt1);
mov(tmp3, 0x0101010101010101);
BIND(CH1_LOOP);
ldr(ch1, Address(str1, cnt1_neg));
eor(ch1, ch, ch1);
sub(tmp1, ch1, tmp3);
orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt1_neg, cnt1_neg, 8);
br(LT, CH1_LOOP);
cmp(cnt1_neg, (u1)8);
mov(cnt1_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
rev(tmp1, tmp1);
clz(tmp1, tmp1);
add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
b(MATCH);
BIND(DO1_SHORT);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1));
sub(cnt1_neg, zr, cnt1);
BIND(DO1_LOOP);
ldrb(ch1, Address(str1, cnt1_neg));
cmp(ch, ch1);
br(EQ, MATCH);
adds(cnt1_neg, cnt1_neg, 1);
br(LT, DO1_LOOP);
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
add(result, result_tmp, cnt1_neg);
BIND(DONE);
}
// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
PRegister pgtmp1, PRegister pgtmp2, int ae) {
Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
SHORT_LOOP_START, TAIL_CHECK;
bool isLL = ae == StrIntrinsicNode::LL;
bool isLU = ae == StrIntrinsicNode::LU;
bool isUL = ae == StrIntrinsicNode::UL;
// The stub threshold for LL strings is: 72 (64 + 8) chars
// UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
// LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
bool str1_isL = isLL || isLU;
bool str2_isL = isLL || isUL;
int str1_chr_shift = str1_isL ? 0 : 1;
int str2_chr_shift = str2_isL ? 0 : 1;
int str1_chr_size = str1_isL ? 1 : 2;
int str2_chr_size = str2_isL ? 1 : 2;
int minCharsInWord = isLL ? wordSize : wordSize/2;
FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
(uxt_insn)&MacroAssembler::uxthw;
BLOCK_COMMENT("string_compare {");
// Bizarrely, the counts are passed in bytes, regardless of whether they
// are L or U strings; however, the result is always in characters.
if (!str1_isL) asrw(cnt1, cnt1, 1);
if (!str2_isL) asrw(cnt2, cnt2, 1);
// Compute the minimum of the string lengths and save the difference.
subsw(result, cnt1, cnt2);
cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
// A very short string
cmpw(cnt2, minCharsInWord);
br(Assembler::LE, SHORT_STRING);
// Compare longwords
// load first parts of strings and finish initialization while loading
{
if (str1_isL == str2_isL) { // LL or UU
ldr(tmp1, Address(str1));
cmp(str1, str2);
br(Assembler::EQ, DONE);
ldr(tmp2, Address(str2));
cmp(cnt2, stub_threshold);
br(GE, STUB);
subsw(cnt2, cnt2, minCharsInWord);
br(EQ, TAIL_CHECK);
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
} else if (isLU) {
ldrs(vtmp, Address(str1));
ldr(tmp2, Address(str2));
cmp(cnt2, stub_threshold);
br(GE, STUB);
subw(cnt2, cnt2, 4);
eor(vtmpZ, T16B, vtmpZ, vtmpZ);
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
zip1(vtmp, T8B, vtmp, vtmpZ);
sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
add(cnt1, cnt1, 4);
fmovd(tmp1, vtmp);
} else { // UL case
ldr(tmp1, Address(str1));
ldrs(vtmp, Address(str2));
cmp(cnt2, stub_threshold);
br(GE, STUB);
subw(cnt2, cnt2, 4);
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
eor(vtmpZ, T16B, vtmpZ, vtmpZ);
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
zip1(vtmp, T8B, vtmp, vtmpZ);
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
add(cnt1, cnt1, 8);
fmovd(tmp2, vtmp);
}
adds(cnt2, cnt2, isUL ? 4 : 8);
br(GE, TAIL);
eor(rscratch2, tmp1, tmp2);
cbnz(rscratch2, DIFF);
// main loop
bind(NEXT_WORD);
if (str1_isL == str2_isL) {
ldr(tmp1, Address(str1, cnt2));
ldr(tmp2, Address(str2, cnt2));
adds(cnt2, cnt2, 8);
} else if (isLU) {
ldrs(vtmp, Address(str1, cnt1));
ldr(tmp2, Address(str2, cnt2));
add(cnt1, cnt1, 4);
zip1(vtmp, T8B, vtmp, vtmpZ);
fmovd(tmp1, vtmp);
adds(cnt2, cnt2, 8);
} else { // UL
ldrs(vtmp, Address(str2, cnt2));
ldr(tmp1, Address(str1, cnt1));
zip1(vtmp, T8B, vtmp, vtmpZ);
add(cnt1, cnt1, 8);
fmovd(tmp2, vtmp);
adds(cnt2, cnt2, 4);
}
br(GE, TAIL);
eor(rscratch2, tmp1, tmp2);
cbz(rscratch2, NEXT_WORD);
b(DIFF);
bind(TAIL);
eor(rscratch2, tmp1, tmp2);
cbnz(rscratch2, DIFF);
// Last longword. In the case where length == 4 we compare the
// same longword twice, but that's still faster than another
// conditional branch.
if (str1_isL == str2_isL) {
ldr(tmp1, Address(str1));
ldr(tmp2, Address(str2));
} else if (isLU) {
ldrs(vtmp, Address(str1));
ldr(tmp2, Address(str2));
zip1(vtmp, T8B, vtmp, vtmpZ);
fmovd(tmp1, vtmp);
} else { // UL
ldrs(vtmp, Address(str2));
ldr(tmp1, Address(str1));
zip1(vtmp, T8B, vtmp, vtmpZ);
fmovd(tmp2, vtmp);
}
bind(TAIL_CHECK);
eor(rscratch2, tmp1, tmp2);
cbz(rscratch2, DONE);
// Find the first different characters in the longwords and
// compute their difference.
bind(DIFF);
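// rscratch2 = tmp1 ^ tmp2. Byte-reversing it and counting leading zeros gives the
// bit offset of the lowest-addressed differing byte; rounding down to a character
// boundary (8 bits for Latin1, 16 for UTF-16) and shifting both longwords right by
// that amount brings the first differing characters into the low bits for extraction.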
rev(rscratch2, rscratch2);
clz(rscratch2, rscratch2);
andr(rscratch2, rscratch2, isLL ? -8 : -16);
lsrv(tmp1, tmp1, rscratch2);
(this->*ext_chr)(tmp1, tmp1);
lsrv(tmp2, tmp2, rscratch2);
(this->*ext_chr)(tmp2, tmp2);
subw(result, tmp1, tmp2);
b(DONE);
}
bind(STUB);
RuntimeAddress stub = nullptr;
switch(ae) {
case StrIntrinsicNode::LL:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
break;
case StrIntrinsicNode::UU:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
break;
case StrIntrinsicNode::LU:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
break;
case StrIntrinsicNode::UL:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
break;
default:
ShouldNotReachHere();
}
assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
address call = trampoline_call(stub);
if (call == nullptr) {
DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
ciEnv::current()->record_failure("CodeCache is full");
return;
}
b(DONE);
bind(SHORT_STRING);
// Is the minimum length zero?
cbz(cnt2, DONE);
// Arrange the code to do most branches while loading, and to load the next
// characters while comparing the previous ones.
(this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
subs(cnt2, cnt2, 1);
br(EQ, SHORT_LAST_INIT);
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
b(SHORT_LOOP_START);
bind(SHORT_LOOP);
subs(cnt2, cnt2, 1);
br(EQ, SHORT_LAST);
bind(SHORT_LOOP_START);
(this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
(this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
cmp(tmp1, cnt1);
br(NE, SHORT_LOOP_TAIL);
subs(cnt2, cnt2, 1);
br(EQ, SHORT_LAST2);
(this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
cmp(tmp2, rscratch1);
br(EQ, SHORT_LOOP);
sub(result, tmp2, rscratch1);
b(DONE);
bind(SHORT_LOOP_TAIL);
sub(result, tmp1, cnt1);
b(DONE);
bind(SHORT_LAST2);
cmp(tmp2, rscratch1);
br(EQ, DONE);
sub(result, tmp2, rscratch1);
b(DONE);
bind(SHORT_LAST_INIT);
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
bind(SHORT_LAST);
cmp(tmp1, cnt1);
br(EQ, DONE);
sub(result, tmp1, cnt1);
bind(DONE);
BLOCK_COMMENT("} string_compare");
}
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
FloatRegister src2, Condition cond, bool isQ) {
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
FloatRegister zn = src1, zm = src2;
bool needs_negation = false;
switch (cond) {
case LT: cond = GT; zn = src2; zm = src1; break;
case LE: cond = GE; zn = src2; zm = src1; break;
case LO: cond = HI; zn = src2; zm = src1; break;
case LS: cond = HS; zn = src2; zm = src1; break;
case NE: cond = EQ; needs_negation = true; break;
default:
break;
}
if (is_floating_point_type(bt)) {
fcm(cond, dst, size, zn, zm);
} else {
cm(cond, dst, size, zn, zm);
}
if (needs_negation) {
notr(dst, isQ ? T16B : T8B, dst);
}
}
void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
Condition cond, bool isQ) {
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
if (bt == T_FLOAT || bt == T_DOUBLE) {
if (cond == Assembler::NE) {
fcm(Assembler::EQ, dst, size, src);
notr(dst, isQ ? T16B : T8B, dst);
} else {
fcm(cond, dst, size, src);
}
} else {
if (cond == Assembler::NE) {
cm(Assembler::EQ, dst, size, src);
notr(dst, isQ ? T16B : T8B, dst);
} else {
cm(cond, dst, size, src);
}
}
}
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
// Example input, dst = 0x01 00 00 00 01 01 00 01
// The "??" bytes are garbage.
orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
andr(dst, dst, 0xff); // dst = 0x8D
}
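// A scalar C sketch of the transformation above, assuming every input byte is
// 0x00 or 0x01 (illustrative only, not part of the generated code):
//
//   uint64_t bytemask_compress(uint64_t x) {
//     x |= x >> 7;                  // pair up neighbouring byte LSBs
//     x |= x >> 14;                 // gather groups of four
//     x |= x >> 28;                 // gather all eight bits into the low byte
//     return x & 0xff;              // drop the garbage in the higher bits
//   }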
// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
assert_different_registers(dst, rscratch1);
assert_different_registers(vtmp1, vtmp2);
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
// Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
// Expected: dst = 0x658D
// Convert the mask into vector with sequential bytes.
// vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
sve_cpy(vtmp1, size, src, 1, false);
if (bt != T_BYTE) {
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
}
if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
// Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
// is to compress the significant bit of each byte in a cross-lane way. Due
// to the lack of a cross-lane bit-compress instruction, we use BEXT
// (bit-compress in each lane) with the biggest lane size (T = D) then
// concatenate the results.
// The second source input of BEXT, initialized with 0x01 in each byte.
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
sve_dup(vtmp2, B, 1);
// BEXT vtmp1.D, vtmp1.D, vtmp2.D
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
// ---------------------------------------
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
sve_bext(vtmp1, D, vtmp1, vtmp2);
// Concatenate the least significant 8 bits of each 8-byte group, and extract the
// result to dst.
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
// dst = 0x658D
if (lane_cnt <= 8) {
// No need to concatenate.
umov(dst, vtmp1, B, 0);
} else if (lane_cnt <= 16) {
ins(vtmp1, B, vtmp1, 1, 8);
umov(dst, vtmp1, H, 0);
} else {
// As the lane count is 64 at most, the final expected value must be in
// the lowest 64 bits after narrowing vtmp1 from D to B.
sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
umov(dst, vtmp1, D, 0);
}
} else if (UseSVE > 0) {
// Compress the lowest 8 bytes.
fmovd(dst, vtmp1);
bytemask_compress(dst);
if (lane_cnt <= 8) return;
// Repeat on higher bytes and join the results.
// Compress 8 bytes in each iteration.
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
bytemask_compress(rscratch1);
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
}
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
// Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
// Expected: dst = 0b01100101 10001101
// Put long value from general purpose register into the first lane of vector.
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
sve_dup(vtmp1, B, 0);
mov(vtmp1, D, 0, src);
// As sve_cmp generates the mask value with a minimum granularity of one byte, we
// must transform the value in the first lane, which is currently a mask in bits,
// into a mask in bytes; this can be done with SVE2's BDEP instruction.
// The first source input of the BDEP instruction: deposit one mask byte into every 8 bytes.
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
if (lane_cnt <= 8) {
// Nothing to do, as only one byte exists.
} else if (lane_cnt <= 16) {
ins(vtmp1, B, vtmp1, 8, 1);
mov(vtmp1, B, 1, zr);
} else {
sve_vector_extend(vtmp1, D, vtmp1, B);
}
// The second source input of BDEP instruction, initialized with 0x01 for each byte.
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
sve_dup(vtmp2, B, 1);
// BDEP vtmp1.D, vtmp1.D, vtmp2.D
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
// ---------------------------------------
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
sve_bdep(vtmp1, D, vtmp1, vtmp2);
if (bt != T_BYTE) {
sve_vector_extend(vtmp1, size, vtmp1, B);
}
// Generate mask according to the given vector, in which the elements have been
// extended to expected type.
// dst = 0b01100101 10001101
sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}
// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
FloatRegister zn, FloatRegister zm, Condition cond) {
assert(pg->is_governing(), "This register has to be a governing predicate register");
FloatRegister z1 = zn, z2 = zm;
switch (cond) {
case LE: z1 = zm; z2 = zn; cond = GE; break;
case LT: z1 = zm; z2 = zn; cond = GT; break;
case LO: z1 = zm; z2 = zn; cond = HI; break;
case LS: z1 = zm; z2 = zn; cond = HS; break;
default:
break;
}
SIMD_RegVariant size = elemType_to_regVariant(bt);
if (is_floating_point_type(bt)) {
sve_fcm(cond, pd, size, pg, z1, z2);
} else {
assert(is_integral_type(bt), "unsupported element type");
sve_cmp(cond, pd, size, pg, z1, z2);
}
}
// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
SIMD_RegVariant size = elemType_to_regVariant(bt);
sve_rev(ptmp, size, src);
sve_brkb(ptmp, ptrue, ptmp, false);
sve_cntp(dst, size, ptrue, ptmp);
movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
subw(dst, rscratch1, dst);
}
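// A scalar sketch of the computation above (illustrative only): reverse the mask,
// break before the first active lane, and count the active lanes of the result,
// which equals the number of trailing inactive lanes of the original mask.
//
//   int vmask_lasttrue(const bool mask[], int lanes) {
//     for (int i = lanes - 1; i >= 0; i--) {
//       if (mask[i]) return i;
//     }
//     return -1;                    // an all-false mask also yields -1 above
//   }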
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
FloatRegister src, BasicType src_bt) {
if (src_bt == T_BYTE) {
if (dst_bt == T_SHORT) {
// 4B/8B to 4S/8S
assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
sxtl(dst, T8H, src, T8B);
} else {
// 4B to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T8H, src, T8B);
sxtl(dst, T4S, dst, T4H);
}
} else if (src_bt == T_SHORT) {
// 4S to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T4S, src, T4H);
} else if (src_bt == T_INT) {
// 2I to 2L
assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
sxtl(dst, T2D, src, T2S);
} else {
ShouldNotReachHere();
}
}
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
if (src_bt == T_SHORT) {
// 4S/8S to 4B/8B
assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_BYTE, "unsupported");
xtn(dst, T8B, src, T8H);
} else if (src_bt == T_INT) {
// 4I to 4B/4S
assert(src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
xtn(dst, T4H, src, T4S);
if (dst_bt == T_BYTE) {
xtn(dst, T8B, dst, T8H);
}
} else if (src_bt == T_LONG) {
// 2L to 2I
assert(src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_INT, "unsupported");
xtn(dst, T2S, src, T2D);
} else {
ShouldNotReachHere();
}
}
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size) {
assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
if (src_size == B) {
switch (dst_size) {
case H:
sve_sunpklo(dst, H, src);
break;
case S:
sve_sunpklo(dst, H, src);
sve_sunpklo(dst, S, dst);
break;
case D:
sve_sunpklo(dst, H, src);
sve_sunpklo(dst, S, dst);
sve_sunpklo(dst, D, dst);
break;
default:
ShouldNotReachHere();
}
} else if (src_size == H) {
if (dst_size == S) {
sve_sunpklo(dst, S, src);
} else { // D
sve_sunpklo(dst, S, src);
sve_sunpklo(dst, D, dst);
}
} else if (src_size == S) {
sve_sunpklo(dst, D, src);
}
}
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size,
FloatRegister tmp) {
assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
assert_different_registers(src, tmp);
sve_dup(tmp, src_size, 0);
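// uzp1 keeps the even-numbered elements of the concatenation (src, tmp); with tmp
// zeroed this extracts the low half of each src element into the low half of dst
// and fills the upper part of dst with zero.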
if (src_size == D) {
switch (dst_size) {
case S:
sve_uzp1(dst, S, src, tmp);
break;
case H:
assert_different_registers(dst, tmp);
sve_uzp1(dst, S, src, tmp);
sve_uzp1(dst, H, dst, tmp);
break;
case B:
assert_different_registers(dst, tmp);
sve_uzp1(dst, S, src, tmp);
sve_uzp1(dst, H, dst, tmp);
sve_uzp1(dst, B, dst, tmp);
break;
default:
ShouldNotReachHere();
}
} else if (src_size == S) {
if (dst_size == H) {
sve_uzp1(dst, H, src, tmp);
} else { // B
assert_different_registers(dst, tmp);
sve_uzp1(dst, H, src, tmp);
sve_uzp1(dst, B, dst, tmp);
}
} else if (src_size == H) {
sve_uzp1(dst, B, src, tmp);
}
}
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
uint dst_element_length_in_bytes,
uint src_element_length_in_bytes) {
if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
sve_punpklo(dst, src);
} else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
sve_punpklo(dst, src);
sve_punpklo(dst, dst);
} else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
sve_punpklo(dst, src);
sve_punpklo(dst, dst);
sve_punpklo(dst, dst);
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
// The insignificant bits in src predicate are expected to be zero.
// To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
// passed as the second argument. An example narrowing operation with a given mask:
// 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
// Mask (for 2 Longs): TF
// Predicate register for the above mask (16 bits): 00000001 00000000
// After narrowing (uzp1 dst.b, src.b, ptmp.b): 0000 0000 0001 0000
// which translates to a mask for 2 integers of: TF (the lower half is considered while the upper half is 0)
assert_different_registers(src, ptmp);
assert_different_registers(dst, ptmp);
sve_pfalse(ptmp);
if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
sve_uzp1(dst, B, src, ptmp);
} else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
sve_uzp1(dst, H, src, ptmp);
sve_uzp1(dst, B, dst, ptmp);
} else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
sve_uzp1(dst, S, src, ptmp);
sve_uzp1(dst, H, dst, ptmp);
sve_uzp1(dst, B, dst, ptmp);
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_add_integral {");
switch(bt) {
case T_BYTE:
addv(vtmp, isQ ? T16B : T8B, vsrc);
smov(dst, vtmp, B, 0);
addw(dst, dst, isrc, ext::sxtb);
break;
case T_SHORT:
addv(vtmp, isQ ? T8H : T4H, vsrc);
smov(dst, vtmp, H, 0);
addw(dst, dst, isrc, ext::sxth);
break;
case T_INT:
isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
umov(dst, vtmp, S, 0);
addw(dst, dst, isrc);
break;
case T_LONG:
assert(isQ, "unsupported");
addpd(vtmp, vsrc);
umov(dst, vtmp, D, 0);
add(dst, dst, isrc);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_add_integral");
}
// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_mul_integral {");
switch(bt) {
case T_BYTE:
if (isQ) {
// Multiply the lower half and higher half of vector iteratively.
// vtmp1 = vsrc[8:15]
ins(vtmp1, D, vsrc, 0, 1);
// vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
mulv(vtmp1, T8B, vtmp1, vsrc);
// vtmp2 = vtmp1[4:7]
ins(vtmp2, S, vtmp1, 0, 1);
// vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
mulv(vtmp1, T8B, vtmp2, vtmp1);
} else {
ins(vtmp1, S, vsrc, 0, 1);
mulv(vtmp1, T8B, vtmp1, vsrc);
}
// vtmp2 = vtmp1[2:3]
ins(vtmp2, H, vtmp1, 0, 1);
// vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
mulv(vtmp2, T8B, vtmp2, vtmp1);
// dst = vtmp2[0] * isrc * vtmp2[1]
umov(rscratch1, vtmp2, B, 0);
mulw(dst, rscratch1, isrc);
sxtb(dst, dst);
umov(rscratch1, vtmp2, B, 1);
mulw(dst, rscratch1, dst);
sxtb(dst, dst);
break;
case T_SHORT:
if (isQ) {
ins(vtmp2, D, vsrc, 0, 1);
mulv(vtmp2, T4H, vtmp2, vsrc);
ins(vtmp1, S, vtmp2, 0, 1);
mulv(vtmp1, T4H, vtmp1, vtmp2);
} else {
ins(vtmp1, S, vsrc, 0, 1);
mulv(vtmp1, T4H, vtmp1, vsrc);
}
umov(rscratch1, vtmp1, H, 0);
mulw(dst, rscratch1, isrc);
sxth(dst, dst);
umov(rscratch1, vtmp1, H, 1);
mulw(dst, rscratch1, dst);
sxth(dst, dst);
break;
case T_INT:
if (isQ) {
ins(vtmp1, D, vsrc, 0, 1);
mulv(vtmp1, T2S, vtmp1, vsrc);
} else {
vtmp1 = vsrc;
}
umov(rscratch1, vtmp1, S, 0);
mul(dst, rscratch1, isrc);
umov(rscratch1, vtmp1, S, 1);
mul(dst, rscratch1, dst);
break;
case T_LONG:
umov(rscratch1, vsrc, D, 0);
mul(dst, isrc, rscratch1);
umov(rscratch1, vsrc, D, 1);
mul(dst, dst, rscratch1);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_mul_integral");
}
// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
FloatRegister fsrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_mul_fp {");
switch(bt) {
case T_FLOAT:
fmuls(dst, fsrc, vsrc);
ins(vtmp, S, vsrc, 0, 1);
fmuls(dst, dst, vtmp);
if (isQ) {
ins(vtmp, S, vsrc, 0, 2);
fmuls(dst, dst, vtmp);
ins(vtmp, S, vsrc, 0, 3);
fmuls(dst, dst, vtmp);
}
break;
case T_DOUBLE:
assert(isQ, "unsupported");
fmuld(dst, fsrc, vsrc);
ins(vtmp, D, vsrc, 0, 1);
fmuld(dst, dst, vtmp);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_mul_fp");
}
// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
Register Rn, Register Rm,
enum shift_kind kind, unsigned shift) {
switch(opc) {
case Op_AndReductionV:
is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
break;
case Op_OrReductionV:
is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
break;
case Op_XorReductionV:
is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes) {
assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
"unsupported");
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_logical {");
umov(rscratch1, vsrc, isQ ? D : S, 0);
umov(dst, vsrc, isQ ? D : S, 1);
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
switch(bt) {
case T_BYTE:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
sxtb(dst, dst);
break;
case T_SHORT:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
sxth(dst, dst);
break;
case T_INT:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
break;
case T_LONG:
assert(isQ, "unsupported");
neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_logical");
}
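// A scalar sketch of the folding above for the byte case on a 128-bit vector
// (illustrative only): with the two 64-bit halves h0 and h1 of the vector,
//
//   uint64_t r = h1 OP h0;
//   r = r OP (r >> 32);             // 64-bit op
//   uint32_t w = (uint32_t)r;
//   w = w OP (w >> 16);
//   w = w OP (w >> 8);
//   result = (int8_t)(isrc OP w);   // combine with the scalar input and sign-extend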
// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
bool is_min = opc == Op_MinReductionV;
BLOCK_COMMENT("neon_reduce_minmax_integral {");
if (bt == T_LONG) {
assert(vtmp == fnoreg, "should be");
assert(isQ, "should be");
umov(rscratch1, vsrc, D, 0);
cmp(isrc, rscratch1);
csel(dst, isrc, rscratch1, is_min ? LT : GT);
umov(rscratch1, vsrc, D, 1);
cmp(dst, rscratch1);
csel(dst, dst, rscratch1, is_min ? LT : GT);
} else {
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
if (size == T2S) {
is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
} else {
is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
}
if (bt == T_INT) {
umov(dst, vtmp, S, 0);
} else {
smov(dst, vtmp, elemType_to_regVariant(bt), 0);
}
cmpw(dst, isrc);
cselw(dst, dst, isrc, is_min ? LT : GT);
}
BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
FloatRegister src2, PRegister pg, FloatRegister tmp) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
assert(pg->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(src1, dst);
// Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
switch (opc) {
case Op_AddReductionVI: {
sve_uaddv(tmp, size, pg, src2);
if (bt == T_BYTE) {
smov(dst, tmp, size, 0);
addw(dst, src1, dst, ext::sxtb);
} else if (bt == T_SHORT) {
smov(dst, tmp, size, 0);
addw(dst, src1, dst, ext::sxth);
} else {
umov(dst, tmp, size, 0);
addw(dst, dst, src1);
}
break;
}
case Op_AddReductionVL: {
sve_uaddv(tmp, size, pg, src2);
umov(dst, tmp, size, 0);
add(dst, dst, src1);
break;
}
case Op_AndReductionV: {
sve_andv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
andr(dst, dst, src1);
} else {
andw(dst, dst, src1);
}
break;
}
case Op_OrReductionV: {
sve_orv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
orr(dst, dst, src1);
} else {
orrw(dst, dst, src1);
}
break;
}
case Op_XorReductionV: {
sve_eorv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
eor(dst, dst, src1);
} else {
eorw(dst, dst, src1);
}
break;
}
case Op_MaxReductionV: {
sve_smaxv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
cmp(dst, src1);
csel(dst, dst, src1, Assembler::GT);
} else {
cmpw(dst, src1);
cselw(dst, dst, src1, Assembler::GT);
}
break;
}
case Op_MinReductionV: {
sve_sminv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
cmp(dst, src1);
csel(dst, dst, src1, Assembler::LT);
} else {
cmpw(dst, src1);
cselw(dst, dst, src1, Assembler::LT);
}
break;
}
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
if (bt == T_BYTE) {
sxtb(dst, dst);
} else if (bt == T_SHORT) {
sxth(dst, dst);
}
}
}
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
uint32_t max_vector_length = Matcher::max_vector_size(bt);
assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
// Set all elements to false if the input "lane_cnt" is zero.
if (lane_cnt == 0) {
sve_pfalse(dst);
return;
}
SIMD_RegVariant size = elemType_to_regVariant(bt);
assert(size != Q, "invalid size");
// Set all elements to true if "lane_cnt" equals the max lane count.
if (lane_cnt == max_vector_length) {
sve_ptrue(dst, size, /* ALL */ 0b11111);
return;
}
// Fixed numbers for "ptrue".
switch(lane_cnt) {
case 1: /* VL1 */
case 2: /* VL2 */
case 3: /* VL3 */
case 4: /* VL4 */
case 5: /* VL5 */
case 6: /* VL6 */
case 7: /* VL7 */
case 8: /* VL8 */
sve_ptrue(dst, size, lane_cnt);
return;
case 16:
sve_ptrue(dst, size, /* VL16 */ 0b01001);
return;
case 32:
sve_ptrue(dst, size, /* VL32 */ 0b01010);
return;
case 64:
sve_ptrue(dst, size, /* VL64 */ 0b01011);
return;
case 128:
sve_ptrue(dst, size, /* VL128 */ 0b01100);
return;
case 256:
sve_ptrue(dst, size, /* VL256 */ 0b01101);
return;
default:
break;
}
// Special patterns for "ptrue".
if (lane_cnt == round_down_power_of_2(max_vector_length)) {
sve_ptrue(dst, size, /* POW2 */ 0b00000);
} else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
sve_ptrue(dst, size, /* MUL4 */ 0b11101);
} else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
sve_ptrue(dst, size, /* MUL3 */ 0b11110);
} else {
// Fall back to "whileltw" for the remaining cases.
mov(rscratch1, lane_cnt);
sve_whileltw(dst, size, zr, rscratch1);
}
}
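// A worked example of the pattern selection above, assuming a 512-bit SVE
// implementation and bt == T_BYTE, so max_vector_length == 64:
//   lane_cnt == 0               -> sve_pfalse
//   lane_cnt == 64              -> sve_ptrue ALL
//   lane_cnt in [1, 8], 16, 32  -> one of the fixed VLn encodings
//   lane_cnt == 63              -> MUL3 (64 - 64 % 3 == 63)
//   anything else, e.g. 40      -> mov rscratch1, 40; sve_whileltw(dst, B, zr, rscratch1)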
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2);
assert_different_registers(mask, pgtmp);
// Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
// mask = 0001 0000 0000 0001 0001 0000 0001 0001
// Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_dup(vtmp2, H, 0);
// Extend lowest half to type INT.
// dst = 00004444 00003333 00002222 00001111
sve_uunpklo(dst, S, src);
// pgtmp = 00000001 00000000 00000001 00000001
sve_punpklo(pgtmp, mask);
// Pack the active INT-sized elements to the right,
// and fill the remaining lanes with zero.
// dst = 00000000 00004444 00002222 00001111
sve_compact(dst, S, dst, pgtmp);
// Narrow the result back to type SHORT.
// dst = 0000 0000 0000 0000 0000 4444 2222 1111
sve_uzp1(dst, H, dst, vtmp2);
// Count the active elements of lowest half.
// rscratch1 = 3
sve_cntp(rscratch1, S, ptrue, pgtmp);
// Repeat for the highest half.
// pgtmp = 00000001 00000000 00000000 00000001
sve_punpkhi(pgtmp, mask);
// vtmp1 = 00008888 00007777 00006666 00005555
sve_uunpkhi(vtmp1, S, src);
// vtmp1 = 00000000 00000000 00008888 00005555
sve_compact(vtmp1, S, vtmp1, pgtmp);
// vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
sve_uzp1(vtmp1, H, vtmp1, vtmp2);
// Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
// Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
// Left shift (cross lane) the compressed high part by TRUE_CNT lanes, where
// TRUE_CNT is the number of active elements in the compressed low part.
neg(rscratch1, rscratch1);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, H, rscratch1, 1);
// vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
sve_tbl(vtmp1, H, vtmp1, vtmp2);
// Combine the compressed high part (after shifting) with the compressed low part.
// dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_orr(dst, dst, vtmp1);
}
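// The net effect of sve_compress_short is the scalar loop below; "lane_count",
// "mask_lane()", "src_lane()" and "dst_lane()" are illustrative helpers only:
//
//   int j = 0;
//   for (int i = 0; i < lane_count; i++) {
//     if (mask_lane(i)) {
//       dst_lane(j++) = src_lane(i);   // active elements packed to the lowest lanes
//     }
//   }
//   for (; j < lane_count; j++) {
//     dst_lane(j) = 0;                 // remaining lanes are zero-filled
//   }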
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
assert_different_registers(mask, ptmp, pgtmp);
// Example input: src = 88 77 66 55 44 33 22 11
// mask = 01 00 00 01 01 00 01 01
// Expected result: dst = 00 00 00 88 55 44 22 11
sve_dup(vtmp4, B, 0);
// Extend lowest half to type SHORT.
// vtmp1 = 0044 0033 0022 0011
sve_uunpklo(vtmp1, H, src);
// ptmp = 0001 0000 0001 0001
sve_punpklo(ptmp, mask);
// Count the active elements of lowest half.
// rscratch2 = 3
sve_cntp(rscratch2, H, ptrue, ptmp);
// Pack the active SHORT-sized elements to the right,
// and fill the remaining lanes with zero.
// dst = 0000 0044 0022 0011
sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
// Narrow the result back to type BYTE.
// dst = 00 00 00 00 00 44 22 11
sve_uzp1(dst, B, dst, vtmp4);
// Repeat for the highest half.
// ptmp = 0001 0000 0000 0001
sve_punpkhi(ptmp, mask);
// vtmp2 = 0088 0077 0066 0055
sve_uunpkhi(vtmp2, H, src);
// vtmp1 = 0000 0000 0088 0055
sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
sve_dup(vtmp4, B, 0);
// vtmp1 = 00 00 00 00 00 00 88 55
sve_uzp1(vtmp1, B, vtmp1, vtmp4);
// Compressed low: dst = 00 00 00 00 00 44 22 11
// Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
// Left shift (cross lane) the compressed high part by TRUE_CNT lanes, where
// TRUE_CNT is the number of active elements in the compressed low part.
neg(rscratch2, rscratch2);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, B, rscratch2, 1);
// vtmp1 = 00 00 00 88 55 00 00 00
sve_tbl(vtmp1, B, vtmp1, vtmp2);
// Combine the compressed high part (after shifting) with the compressed low part.
// dst = 00 00 00 88 55 44 22 11
sve_orr(dst, dst, vtmp1);
}
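// SVE "compact" only supports S and D element sizes, which is why the byte variant
// widens each half to SHORT and reuses sve_compress_short, then stitches the halves
// back together. Schematically, with TRUE_CNT the active-lane count of the low half:
//
//   dst = compress(low half)                                      // lanes [0, TRUE_CNT)
//       | compress(high half) shifted left by TRUE_CNT byte lanes // via sve_index/sve_tbl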
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
SIMD_Arrangement size = isQ ? T16B : T8B;
if (bt == T_BYTE) {
rbit(dst, size, src);
} else {
neon_reverse_bytes(dst, src, bt, isQ);
rbit(dst, size, dst);
}
}
void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
SIMD_Arrangement size = isQ ? T16B : T8B;
switch (bt) {
case T_BYTE:
if (dst != src) {
orr(dst, size, src, src);
}
break;
case T_SHORT:
rev16(dst, size, src);
break;
case T_INT:
rev32(dst, size, src);
break;
case T_LONG:
rev64(dst, size, src);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
}
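// neon_reverse_bits composes the two steps: bytes are reordered within each element
// (rev16/rev32/rev64) and the bits inside every byte are then reversed (rbit), which
// together reverse the bit order of the whole element. A worked example for one T_INT
// lane (value illustrative):
//
//   element               = 0x01020304
//   after rev32           = 0x04030201   // byte order reversed within the element
//   after rbit (per byte) = 0x20C04080   // == full bit reversal of 0x01020304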
// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
int idx, FloatRegister vtmp) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
if (bt == T_INT || bt == T_LONG) {
umov(dst, src, size, idx);
} else {
smov(dst, src, size, idx);
}
} else {
sve_orr(vtmp, src, src);
sve_ext(vtmp, vtmp, idx << size);
if (bt == T_INT || bt == T_LONG) {
umov(dst, vtmp, size, 0);
} else {
smov(dst, vtmp, size, 0);
}
}
}
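// A worked example for the SVE path above, assuming bt == T_INT and idx == 5:
// 32 * 5 == 160 >= 128, so a plain NEON umov cannot reach the element. The code
// copies src into vtmp and uses sve_ext with a byte offset of idx << size
// (5 << 2 == 20 bytes) to bring the wanted element down to lane 0, which is then
// moved into dst.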
// java.lang.Math::round intrinsics
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
assert_different_registers(tmp1, tmp2, tmp3, src, dst);
switch (T) {
case T2S:
case T4S:
fmovs(tmp1, T, 0.5f);
mov(rscratch1, jint_cast(0x1.0p23f));
break;
case T2D:
fmovd(tmp1, T, 0.5);
mov(rscratch1, julong_cast(0x1.0p52));
break;
default:
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
}
fadd(tmp1, T, tmp1, src);
fcvtms(tmp1, T, tmp1);
// tmp1 = floor(src + 0.5, ties to even)
fcvtas(dst, T, src);
// dst = round(src), ties to away
fneg(tmp3, T, src);
dup(tmp2, T, rscratch1);
cm(HS, tmp3, T, tmp3, tmp2);
// tmp3 is now a set of flags
bif(dst, T16B, tmp1, tmp3);
// result in dst
}
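// A scalar sketch of the per-lane selection above for float lanes (the double case
// uses 0x1.0p52); "bits()" denotes the raw IEEE bit pattern and is illustrative only:
//
//   bool keep_ties_to_away = bits(-src) >= bits(0x1.0p23f);   // unsigned compare (cm HS)
//   // true when src is +0.0, positive, NaN, or a negative with magnitude >= 2^23
//   dst = keep_ties_to_away ? fcvtas(src)          // round to nearest, ties away from zero
//                           : fcvtms(src + 0.5f);  // floor of src + 0.5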
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(tmp1, tmp2, src, dst);
switch (T) {
case S:
mov(rscratch1, jint_cast(0x1.0p23f));
break;
case D:
mov(rscratch1, julong_cast(0x1.0p52));
break;
default:
assert(T == S || T == D, "invalid register variant");
}
sve_frinta(dst, T, ptrue, src);
// dst = round(src), ties to away
Label none;
sve_fneg(tmp1, T, ptrue, src);
sve_dup(tmp2, T, rscratch1);
sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
br(EQ, none);
{
sve_cpy(tmp1, T, pgtmp, 0.5);
sve_fadd(tmp1, T, pgtmp, src);
sve_frintm(dst, T, pgtmp, tmp1);
// dst = floor(src + 0.5, ties to even)
}
bind(none);
sve_fcvtzs(dst, T, ptrue, dst, T);
// result in dst
}
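// The SVE variant makes the same per-lane choice with the compare reversed: pgtmp is
// active where bits(0x1.0p23f) >= bits(-src), i.e. for -0.0 and for negative lanes with
// magnitude <= 2^23; those lanes take floor(src + 0.5) via sve_frintm, the remaining
// lanes keep the sve_frinta (ties away) result, and sve_fcvtzs converts the selected
// value to an integer.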
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
FloatRegister one, SIMD_Arrangement T) {
assert_different_registers(dst, src, zero, one);
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
facgt(dst, T, src, zero);
ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
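// A scalar sketch of the signum selection above; "mask" is the facgt/ushr result and
// "bits()" is illustrative only:
//
//   juint mask = (fabs(src) > 0.0) ? 0x7fffffff : 0;   // 0 for +-0.0 and NaN
//   dst = (bits(one) & mask) | (bits(src) & ~mask);
//   // i.e. +-1.0 carrying src's sign bit, or src itself for +-0.0 and NaN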
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
assert_different_registers(dst, src, zero, one, vtmp);
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
sve_orr(vtmp, src, src);
sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
switch (T) {
case S:
sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
// on the sign of the float value
break;
case D:
sve_and(vtmp, T, min_jlong);
sve_orr(vtmp, T, jlong_cast(1.0));
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
// Result in dst
}
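// The SVE variant reaches the same result with a predicate instead of a bit mask:
// sve_fac(GT) marks lanes where |src| > 0.0, vtmp is built as (src & sign bit) | bits(1.0),
// i.e. +-1.0 with src's sign, and sve_sel copies vtmp into the marked lanes while the
// unmarked lanes (+-0.0 and NaN) keep src unchanged.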
bool C2_MacroAssembler::in_scratch_emit_size() {
if (ciEnv::current()->task() != nullptr) {
PhaseOutput* phase_output = Compile::current()->output();
if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
return true;
}
}
return MacroAssembler::in_scratch_emit_size();
}