src/hotspot/cpu/riscv/macroAssembler_riscv.hpp - toolchain/jdk/jdk21 - Git at Google

 /*
  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.
  *
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  *
  */

 #ifndef CPU_RISCV_MACROASSEMBLER_RISCV_HPP
 #define CPU_RISCV_MACROASSEMBLER_RISCV_HPP

 #include "asm/assembler.inline.hpp"
 #include "code/vmreg.hpp"
 #include "metaprogramming/enableIf.hpp"
 #include "nativeInst_riscv.hpp"
 #include "oops/compressedOops.hpp"
 #include "utilities/powerOfTwo.hpp"

 // MacroAssembler extends Assembler by frequently used macros.
 //
 // Instructions for which a 'better' code sequence exists depending
 // on arguments should also go in here.

 class MacroAssembler: public Assembler {

  public:
   MacroAssembler(CodeBuffer* code) : Assembler(code) {}

   void safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod);

   // Alignment
   int align(int modulus, int extra_offset = 0);

   static inline void assert_alignment(address pc, int alignment = NativeInstruction::instruction_size) {
     assert(is_aligned(pc, alignment), "bad alignment");
   }

   // nop
   void post_call_nop();

   // Stack frame creation/removal
   // Note that SP must be updated to the right place before saving/restoring RA and FP
   // because signal based thread suspend/resume could happen asynchronously.
   void enter() {
     addi(sp, sp, - 2 * wordSize);
     sd(ra, Address(sp, wordSize));
     sd(fp, Address(sp));
     addi(fp, sp, 2 * wordSize);
   }

   void leave() {
     addi(sp, fp, - 2 * wordSize);
     ld(fp, Address(sp));
     ld(ra, Address(sp, wordSize));
     addi(sp, sp, 2 * wordSize);
   }


   // Support for getting the JavaThread pointer (i.e.; a reference to thread-local information)
   // The pointer will be loaded into the thread register.
   void get_thread(Register thread);

   // Support for VM calls
   //
   // It is imperative that all calls into the VM are handled via the call_VM macros.
   // They make sure that the stack linkage is setup correctly. call_VM's correspond
   // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.

   void call_VM(Register oop_result,
                address entry_point,
                bool check_exceptions = true);
   void call_VM(Register oop_result,
                address entry_point,
                Register arg_1,
                bool check_exceptions = true);
   void call_VM(Register oop_result,
                address entry_point,
                Register arg_1, Register arg_2,
                bool check_exceptions = true);
   void call_VM(Register oop_result,
                address entry_point,
                Register arg_1, Register arg_2, Register arg_3,
                bool check_exceptions = true);

   // Overloadings with last_Java_sp
   void call_VM(Register oop_result,
                Register last_java_sp,
                address entry_point,
                int number_of_arguments = 0,
                bool check_exceptions = true);
   void call_VM(Register oop_result,
                Register last_java_sp,
                address entry_point,
                Register arg_1,
                bool check_exceptions = true);
   void call_VM(Register oop_result,
                Register last_java_sp,
                address entry_point,
                Register arg_1, Register arg_2,
                bool check_exceptions = true);
   void call_VM(Register oop_result,
                Register last_java_sp,
                address entry_point,
                Register arg_1, Register arg_2, Register arg_3,
                bool check_exceptions = true);

   void get_vm_result(Register oop_result, Register java_thread);
   void get_vm_result_2(Register metadata_result, Register java_thread);

   // These always tightly bind to MacroAssembler::call_VM_leaf_base
   // bypassing the virtual implementation
   void call_VM_leaf(address entry_point,
                     int number_of_arguments = 0);
   void call_VM_leaf(address entry_point,
                     Register arg_0);
   void call_VM_leaf(address entry_point,
                     Register arg_0, Register arg_1);
   void call_VM_leaf(address entry_point,
                     Register arg_0, Register arg_1, Register arg_2);

   // These always tightly bind to MacroAssembler::call_VM_base
   // bypassing the virtual implementation
   void super_call_VM_leaf(address entry_point, Register arg_0);
   void super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1);
   void super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2);
   void super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3);

   // last Java Frame (fills frame anchor)
   void set_last_Java_frame(Register last_java_sp, Register last_java_fp, address last_java_pc, Register tmp);
   void set_last_Java_frame(Register last_java_sp, Register last_java_fp, Label &last_java_pc, Register tmp);
   void set_last_Java_frame(Register last_java_sp, Register last_java_fp, Register last_java_pc, Register tmp);

   // thread in the default location (xthread)
   void reset_last_Java_frame(bool clear_fp);

   virtual void call_VM_leaf_base(
     address entry_point,                // the entry point
     int     number_of_arguments,        // the number of arguments to pop after the call
     Label*  retaddr = nullptr
   );

   virtual void call_VM_leaf_base(
     address entry_point,                // the entry point
     int     number_of_arguments,        // the number of arguments to pop after the call
     Label&  retaddr) {
     call_VM_leaf_base(entry_point, number_of_arguments, &retaddr);
   }

   virtual void call_VM_base(           // returns the register containing the thread upon return
     Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
     Register java_thread,              // the thread if computed before     ; use noreg otherwise
     Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
     address  entry_point,              // the entry point
     int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
     bool     check_exceptions          // whether to check for pending exceptions after return
   );

   void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions);

   virtual void check_and_handle_earlyret(Register java_thread);
   virtual void check_and_handle_popframe(Register java_thread);

   void resolve_weak_handle(Register result, Register tmp1, Register tmp2);
   void resolve_oop_handle(Register result, Register tmp1, Register tmp2);
   void resolve_jobject(Register value, Register tmp1, Register tmp2);
   void resolve_global_jobject(Register value, Register tmp1, Register tmp2);

   void movoop(Register dst, jobject obj);
   void mov_metadata(Register dst, Metadata* obj);
   void bang_stack_size(Register size, Register tmp);
   void set_narrow_oop(Register dst, jobject obj);
   void set_narrow_klass(Register dst, Klass* k);

   void load_mirror(Register dst, Register method, Register tmp1, Register tmp2);
   void access_load_at(BasicType type, DecoratorSet decorators, Register dst,
                       Address src, Register tmp1, Register tmp2);
   void access_store_at(BasicType type, DecoratorSet decorators, Address dst,
                        Register val, Register tmp1, Register tmp2, Register tmp3);
   void load_klass(Register dst, Register src, Register tmp = t0);
   void store_klass(Register dst, Register src, Register tmp = t0);
   void cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L);

   void encode_klass_not_null(Register r, Register tmp = t0);
   void decode_klass_not_null(Register r, Register tmp = t0);
   void encode_klass_not_null(Register dst, Register src, Register tmp);
   void decode_klass_not_null(Register dst, Register src, Register tmp);
   void decode_heap_oop_not_null(Register r);
   void decode_heap_oop_not_null(Register dst, Register src);
   void decode_heap_oop(Register d, Register s);
   void decode_heap_oop(Register r) { decode_heap_oop(r, r); }
   void encode_heap_oop(Register d, Register s);
   void encode_heap_oop(Register r) { encode_heap_oop(r, r); };
   void load_heap_oop(Register dst, Address src, Register tmp1,
                      Register tmp2, DecoratorSet decorators = 0);
   void load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                               Register tmp2, DecoratorSet decorators = 0);
   void store_heap_oop(Address dst, Register val, Register tmp1,
                       Register tmp2, Register tmp3, DecoratorSet decorators = 0);

   void store_klass_gap(Register dst, Register src);

   // currently unimplemented
   // Used for storing null. All other oop constants should be
   // stored using routines that take a jobject.
   void store_heap_oop_null(Address dst);

   // This dummy is to prevent a call to store_heap_oop from
   // converting a zero (linked null) into a Register by giving
   // the compiler two choices it can't resolve

   void store_heap_oop(Address dst, void* dummy);

   // Support for null-checks
   //
   // Generates code that causes a null OS exception if the content of reg is null.
   // If the accessed location is M[reg + offset] and the offset is known, provide the
   // offset. No explicit code generateion is needed if the offset is within a certain
   // range (0 <= offset <= page_size).

   virtual void null_check(Register reg, int offset = -1);
   static bool needs_explicit_null_check(intptr_t offset);
   static bool uses_implicit_null_check(void* address);

   // idiv variant which deals with MINLONG as dividend and -1 as divisor
   int corrected_idivl(Register result, Register rs1, Register rs2,
                       bool want_remainder);
   int corrected_idivq(Register result, Register rs1, Register rs2,
                       bool want_remainder);

   // interface method calling
   void lookup_interface_method(Register recv_klass,
                                Register intf_klass,
                                RegisterOrConstant itable_index,
                                Register method_result,
                                Register scan_tmp,
                                Label& no_such_interface,
                                bool return_method = true);

   // virtual method calling
   // n.n. x86 allows RegisterOrConstant for vtable_index
   void lookup_virtual_method(Register recv_klass,
                              RegisterOrConstant vtable_index,
                              Register method_result);

   // Form an address from base + offset in Rd. Rd my or may not
   // actually be used: you must use the Address that is returned. It
   // is up to you to ensure that the shift provided matches the size
   // of your data.
   Address form_address(Register Rd, Register base, int64_t byte_offset);

   // Sometimes we get misaligned loads and stores, usually from Unsafe
   // accesses, and these can exceed the offset range.
   Address legitimize_address(Register Rd, const Address &adr) {
     if (adr.getMode() == Address::base_plus_offset) {
       if (!is_simm12(adr.offset())) {
         return form_address(Rd, adr.base(), adr.offset());
       }
     }
     return adr;
   }

   // allocation
   void tlab_allocate(
     Register obj,                   // result: pointer to object after successful allocation
     Register var_size_in_bytes,     // object size in bytes if unknown at compile time; invalid otherwise
     int      con_size_in_bytes,     // object size in bytes if   known at compile time
     Register tmp1,                  // temp register
     Register tmp2,                  // temp register
     Label&   slow_case,             // continuation point of fast allocation fails
     bool     is_far = false
   );

   // Test sub_klass against super_klass, with fast and slow paths.

   // The fast path produces a tri-state answer: yes / no / maybe-slow.
   // One of the three labels can be null, meaning take the fall-through.
   // If super_check_offset is -1, the value is loaded up from super_klass.
   // No registers are killed, except tmp_reg
   void check_klass_subtype_fast_path(Register sub_klass,
                                      Register super_klass,
                                      Register tmp_reg,
                                      Label* L_success,
                                      Label* L_failure,
                                      Label* L_slow_path,
                                      Register super_check_offset = noreg);

   // The reset of the type check; must be wired to a corresponding fast path.
   // It does not repeat the fast path logic, so don't use it standalone.
   // The tmp1_reg and tmp2_reg can be noreg, if no temps are available.
   // Updates the sub's secondary super cache as necessary.
   void check_klass_subtype_slow_path(Register sub_klass,
                                      Register super_klass,
                                      Register tmp1_reg,
                                      Register tmp2_reg,
                                      Label* L_success,
                                      Label* L_failure);

   void check_klass_subtype(Register sub_klass,
                            Register super_klass,
                            Register tmp_reg,
                            Label& L_success);

   Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);

   // only if +VerifyOops
   void _verify_oop(Register reg, const char* s, const char* file, int line);
   void _verify_oop_addr(Address addr, const char* s, const char* file, int line);

   void _verify_oop_checked(Register reg, const char* s, const char* file, int line) {
     if (VerifyOops) {
       _verify_oop(reg, s, file, line);
     }
   }
   void _verify_oop_addr_checked(Address reg, const char* s, const char* file, int line) {
     if (VerifyOops) {
       _verify_oop_addr(reg, s, file, line);
     }
   }

   void _verify_method_ptr(Register reg, const char* msg, const char* file, int line) {}
   void _verify_klass_ptr(Register reg, const char* msg, const char* file, int line) {}

 #define verify_oop(reg) _verify_oop_checked(reg, "broken oop " #reg, __FILE__, __LINE__)
 #define verify_oop_msg(reg, msg) _verify_oop_checked(reg, "broken oop " #reg ", " #msg, __FILE__, __LINE__)
 #define verify_oop_addr(addr) _verify_oop_addr_checked(addr, "broken oop addr " #addr, __FILE__, __LINE__)
 #define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
 #define verify_klass_ptr(reg) _verify_method_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)

   // A more convenient access to fence for our purposes
   // We used four bit to indicate the read and write bits in the predecessors and successors,
   // and extended i for r, o for w if UseConservativeFence enabled.
   enum Membar_mask_bits {
     StoreStore = 0b0101,               // (pred = ow   + succ =   ow)
     LoadStore  = 0b1001,               // (pred = ir   + succ =   ow)
     StoreLoad  = 0b0110,               // (pred = ow   + succ =   ir)
     LoadLoad   = 0b1010,               // (pred = ir   + succ =   ir)
     AnyAny     = LoadStore | StoreLoad // (pred = iorw + succ = iorw)
   };

   void membar(uint32_t order_constraint);

   static void membar_mask_to_pred_succ(uint32_t order_constraint,
                                        uint32_t& predecessor, uint32_t& successor) {
     predecessor = (order_constraint >> 2) & 0x3;
     successor = order_constraint & 0x3;

     // extend rw -> iorw:
     // 01(w) -> 0101(ow)
     // 10(r) -> 1010(ir)
     // 11(rw)-> 1111(iorw)
     if (UseConservativeFence) {
       predecessor |= predecessor << 2;
       successor |= successor << 2;
     }
   }

   static int pred_succ_to_membar_mask(uint32_t predecessor, uint32_t successor) {
     return ((predecessor & 0x3) << 2) | (successor & 0x3);
   }

   void pause() {
     fence(w, 0);
   }

   // prints msg, dumps registers and stops execution
   void stop(const char* msg);

   static void debug64(char* msg, int64_t pc, int64_t regs[]);

   void unimplemented(const char* what = "");

   void should_not_reach_here() { stop("should not reach here"); }

   static address target_addr_for_insn(address insn_addr);

   // Required platform-specific helpers for Label::patch_instructions.
   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
   static int pd_patch_instruction_size(address branch, address target);
   static void pd_patch_instruction(address branch, address target, const char* file = nullptr, int line = 0) {
     pd_patch_instruction_size(branch, target);
   }
   static address pd_call_destination(address branch) {
     return target_addr_for_insn(branch);
   }

   static int patch_oop(address insn_addr, address o);

   static address get_target_of_li32(address insn_addr);
   static int patch_imm_in_li32(address branch, int32_t target);

   // Return whether code is emitted to a scratch blob.
   virtual bool in_scratch_emit_size() {
     return false;
   }

   address emit_trampoline_stub(int insts_call_instruction_offset, address target);
   static int max_trampoline_stub_size();
   void emit_static_call_stub();
   static int static_call_stub_size();

   // The following 4 methods return the offset of the appropriate move instruction

   // Support for fast byte/short loading with zero extension (depending on particular CPU)
   int load_unsigned_byte(Register dst, Address src);
   int load_unsigned_short(Register dst, Address src);

   // Support for fast byte/short loading with sign extension (depending on particular CPU)
   int load_signed_byte(Register dst, Address src);
   int load_signed_short(Register dst, Address src);

   // Load and store values by size and signed-ness
   void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed);
   void store_sized_value(Address dst, Register src, size_t size_in_bytes);

   // Misaligned loads, will use the best way, according to the AvoidUnalignedAccess flag
   void load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity = 1);
   void load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity = 1);
   void load_long_misaligned(Register dst, Address src, Register tmp, int granularity = 1);

  public:
   // Standard pseudo instructions
   inline void nop() {
     addi(x0, x0, 0);
   }

   inline void mv(Register Rd, Register Rs) {
     if (Rd != Rs) {
       addi(Rd, Rs, 0);
     }
   }

   inline void notr(Register Rd, Register Rs) {
     xori(Rd, Rs, -1);
   }

   inline void neg(Register Rd, Register Rs) {
     sub(Rd, x0, Rs);
   }

   inline void negw(Register Rd, Register Rs) {
     subw(Rd, x0, Rs);
   }

   inline void sext_w(Register Rd, Register Rs) {
     addiw(Rd, Rs, 0);
   }

   inline void zext_b(Register Rd, Register Rs) {
     andi(Rd, Rs, 0xFF);
   }

   inline void seqz(Register Rd, Register Rs) {
     sltiu(Rd, Rs, 1);
   }

   inline void snez(Register Rd, Register Rs) {
     sltu(Rd, x0, Rs);
   }

   inline void sltz(Register Rd, Register Rs) {
     slt(Rd, Rs, x0);
   }

   inline void sgtz(Register Rd, Register Rs) {
     slt(Rd, x0, Rs);
   }

   // Bit-manipulation extension pseudo instructions
   // zero extend word
   inline void zext_w(Register Rd, Register Rs) {
     add_uw(Rd, Rs, zr);
   }

   // Floating-point data-processing pseudo instructions
   inline void fmv_s(FloatRegister Rd, FloatRegister Rs) {
     if (Rd != Rs) {
       fsgnj_s(Rd, Rs, Rs);
     }
   }

   inline void fabs_s(FloatRegister Rd, FloatRegister Rs) {
     fsgnjx_s(Rd, Rs, Rs);
   }

   inline void fneg_s(FloatRegister Rd, FloatRegister Rs) {
     fsgnjn_s(Rd, Rs, Rs);
   }

   inline void fmv_d(FloatRegister Rd, FloatRegister Rs) {
     if (Rd != Rs) {
       fsgnj_d(Rd, Rs, Rs);
     }
   }

   inline void fabs_d(FloatRegister Rd, FloatRegister Rs) {
     fsgnjx_d(Rd, Rs, Rs);
   }

   inline void fneg_d(FloatRegister Rd, FloatRegister Rs) {
     fsgnjn_d(Rd, Rs, Rs);
   }

   // Control and status pseudo instructions
   void rdinstret(Register Rd);                  // read instruction-retired counter
   void rdcycle(Register Rd);                    // read cycle counter
   void rdtime(Register Rd);                     // read time
   void csrr(Register Rd, unsigned csr);         // read csr
   void csrw(unsigned csr, Register Rs);         // write csr
   void csrs(unsigned csr, Register Rs);         // set bits in csr
   void csrc(unsigned csr, Register Rs);         // clear bits in csr
   void csrwi(unsigned csr, unsigned imm);
   void csrsi(unsigned csr, unsigned imm);
   void csrci(unsigned csr, unsigned imm);
   void frcsr(Register Rd);                      // read float-point csr
   void fscsr(Register Rd, Register Rs);         // swap float-point csr
   void fscsr(Register Rs);                      // write float-point csr
   void frrm(Register Rd);                       // read float-point rounding mode
   void fsrm(Register Rd, Register Rs);          // swap float-point rounding mode
   void fsrm(Register Rs);                       // write float-point rounding mode
   void fsrmi(Register Rd, unsigned imm);
   void fsrmi(unsigned imm);
   void frflags(Register Rd);                    // read float-point exception flags
   void fsflags(Register Rd, Register Rs);       // swap float-point exception flags
   void fsflags(Register Rs);                    // write float-point exception flags
   void fsflagsi(Register Rd, unsigned imm);
   void fsflagsi(unsigned imm);

   // Control transfer pseudo instructions
   void beqz(Register Rs, const address dest);
   void bnez(Register Rs, const address dest);
   void blez(Register Rs, const address dest);
   void bgez(Register Rs, const address dest);
   void bltz(Register Rs, const address dest);
   void bgtz(Register Rs, const address dest);

   void j(Label &l, Register temp = t0);
   void j(const address dest, Register temp = t0);
   void j(const Address &adr, Register temp = t0);
   void jal(Label &l, Register temp = t0);
   void jal(const address dest, Register temp = t0);
   void jal(const Address &adr, Register temp = t0);
   void jal(Register Rd, Label &L, Register temp = t0);
   void jal(Register Rd, const address dest, Register temp = t0);

   //label
   void beqz(Register Rs, Label &l, bool is_far = false);
   void bnez(Register Rs, Label &l, bool is_far = false);
   void blez(Register Rs, Label &l, bool is_far = false);
   void bgez(Register Rs, Label &l, bool is_far = false);
   void bltz(Register Rs, Label &l, bool is_far = false);
   void bgtz(Register Rs, Label &l, bool is_far = false);

   void beq (Register Rs1, Register Rs2, Label &L, bool is_far = false);
   void bne (Register Rs1, Register Rs2, Label &L, bool is_far = false);
   void blt (Register Rs1, Register Rs2, Label &L, bool is_far = false);
   void bge (Register Rs1, Register Rs2, Label &L, bool is_far = false);
   void bltu(Register Rs1, Register Rs2, Label &L, bool is_far = false);
   void bgeu(Register Rs1, Register Rs2, Label &L, bool is_far = false);

   void bgt (Register Rs, Register Rt, const address dest);
   void ble (Register Rs, Register Rt, const address dest);
   void bgtu(Register Rs, Register Rt, const address dest);
   void bleu(Register Rs, Register Rt, const address dest);

   void bgt (Register Rs, Register Rt, Label &l, bool is_far = false);
   void ble (Register Rs, Register Rt, Label &l, bool is_far = false);
   void bgtu(Register Rs, Register Rt, Label &l, bool is_far = false);
   void bleu(Register Rs, Register Rt, Label &l, bool is_far = false);

 #define INSN_ENTRY_RELOC(result_type, header)                               \
   result_type header {                                                      \
     guarantee(rtype == relocInfo::internal_word_type,                       \
               "only internal_word_type relocs make sense here");            \
     relocate(InternalAddress(dest).rspec());                                \
     IncompressibleRegion ir(this);  /* relocations */

 #define INSN(NAME)                                                                                       \
   void NAME(Register Rs1, Register Rs2, const address dest) {                                            \
     assert_cond(dest != nullptr);                                                                        \
     int64_t offset = dest - pc();                                                                        \
     guarantee(is_simm13(offset) && is_even(offset),                                                      \
               "offset is invalid: is_simm_13: %s offset: " INT64_FORMAT,                                 \
               BOOL_TO_STR(is_simm13(offset)), offset);                                                   \
     Assembler::NAME(Rs1, Rs2, offset);                                                                   \
   }                                                                                                      \
   INSN_ENTRY_RELOC(void, NAME(Register Rs1, Register Rs2, address dest, relocInfo::relocType rtype))     \
     NAME(Rs1, Rs2, dest);                                                                                \
   }

   INSN(beq);
   INSN(bne);
   INSN(bge);
   INSN(bgeu);
   INSN(blt);
   INSN(bltu);

 #undef INSN

 #undef INSN_ENTRY_RELOC

   void float_beq(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void float_bne(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void float_ble(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void float_bge(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void float_blt(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void float_bgt(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);

   void double_beq(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void double_bne(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void double_ble(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void double_bge(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void double_blt(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);
   void double_bgt(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far = false, bool is_unordered = false);

 private:
   int push_reg(unsigned int bitset, Register stack);
   int pop_reg(unsigned int bitset, Register stack);
   int push_fp(unsigned int bitset, Register stack);
   int pop_fp(unsigned int bitset, Register stack);
 #ifdef COMPILER2
   int push_v(unsigned int bitset, Register stack);
   int pop_v(unsigned int bitset, Register stack);
 #endif // COMPILER2

   // The signed 20-bit upper imm can materialize at most negative 0xF...F80000000, two G.
   // The following signed 12-bit imm can at max subtract 0x800, two K, from that previously loaded two G.
   bool is_valid_32bit_offset(int64_t x) {
     constexpr int64_t twoG = (2 * G);
     constexpr int64_t twoK = (2 * K);
     return x < (twoG - twoK) && x >= (-twoG - twoK);
   }

 public:
   void push_reg(Register Rs);
   void pop_reg(Register Rd);
   void push_reg(RegSet regs, Register stack) { if (regs.bits()) push_reg(regs.bits(), stack); }
   void pop_reg(RegSet regs, Register stack)  { if (regs.bits()) pop_reg(regs.bits(), stack); }
   void push_fp(FloatRegSet regs, Register stack) { if (regs.bits()) push_fp(regs.bits(), stack); }
   void pop_fp(FloatRegSet regs, Register stack)  { if (regs.bits()) pop_fp(regs.bits(), stack); }
 #ifdef COMPILER2
   void push_v(VectorRegSet regs, Register stack) { if (regs.bits()) push_v(regs.bits(), stack); }
   void pop_v(VectorRegSet regs, Register stack)  { if (regs.bits()) pop_v(regs.bits(), stack); }
 #endif // COMPILER2

   // Push and pop everything that might be clobbered by a native
   // runtime call except t0 and t1. (They are always
   // temporary registers, so we don't have to protect them.)
   // Additional registers can be excluded in a passed RegSet.
   void push_call_clobbered_registers_except(RegSet exclude);
   void pop_call_clobbered_registers_except(RegSet exclude);

   void push_call_clobbered_registers() {
     push_call_clobbered_registers_except(RegSet());
   }
   void pop_call_clobbered_registers() {
     pop_call_clobbered_registers_except(RegSet());
   }

   void push_CPU_state(bool save_vectors = false, int vector_size_in_bytes = 0);
   void pop_CPU_state(bool restore_vectors = false, int vector_size_in_bytes = 0);

   void push_cont_fastpath(Register java_thread);
   void pop_cont_fastpath(Register java_thread);

   // if heap base register is used - reinit it with the correct value
   void reinit_heapbase();

   void bind(Label& L) {
     Assembler::bind(L);
     // fences across basic blocks should not be merged
     code()->clear_last_insn();
   }

   typedef void (MacroAssembler::* compare_and_branch_insn)(Register Rs1, Register Rs2, const address dest);
   typedef void (MacroAssembler::* compare_and_branch_label_insn)(Register Rs1, Register Rs2, Label &L, bool is_far);
   typedef void (MacroAssembler::* jal_jalr_insn)(Register Rt, address dest);
   typedef void (MacroAssembler::* load_insn_by_temp)(Register Rt, address dest, Register temp);

   void wrap_label(Register r, Label &L, Register t, load_insn_by_temp insn);
   void wrap_label(Register r, Label &L, jal_jalr_insn insn);
   void wrap_label(Register r1, Register r2, Label &L,
                   compare_and_branch_insn insn,
                   compare_and_branch_label_insn neg_insn, bool is_far = false);

   void la(Register Rd, Label &label);
   void la(Register Rd, const address dest);
   void la(Register Rd, const Address &adr);

   void li16u(Register Rd, uint16_t imm);
   void li32(Register Rd, int32_t imm);
   void li64(Register Rd, int64_t imm);
   void li  (Register Rd, int64_t imm);  // optimized load immediate

   // mv
   void mv(Register Rd, address addr)                  { li(Rd, (int64_t)addr); }
   void mv(Register Rd, address addr, int32_t &offset) {
     // Split address into a lower 12-bit sign-extended offset and the remainder,
     // so that the offset could be encoded in jalr or load/store instruction.
     offset = ((int32_t)(int64_t)addr << 20) >> 20;
     li(Rd, (int64_t)addr - offset);
   }

   template<typename T, ENABLE_IF(std::is_integral<T>::value)>
   inline void mv(Register Rd, T o)                    { li(Rd, (int64_t)o); }

   void mv(Register Rd, Address dest) {
     assert(dest.getMode() == Address::literal, "Address mode should be Address::literal");
     relocate(dest.rspec(), [&] {
       movptr(Rd, dest.target());
     });
   }

   void mv(Register Rd, RegisterOrConstant src) {
     if (src.is_register()) {
       mv(Rd, src.as_register());
     } else {
       mv(Rd, src.as_constant());
     }
   }

   void movptr(Register Rd, address addr, int32_t &offset);

   void movptr(Register Rd, address addr) {
     int offset = 0;
     movptr(Rd, addr, offset);
     addi(Rd, Rd, offset);
   }

   inline void movptr(Register Rd, uintptr_t imm64) {
     movptr(Rd, (address)imm64);
   }

   // arith
   void add (Register Rd, Register Rn, int64_t increment, Register temp = t0);
   void addw(Register Rd, Register Rn, int32_t increment, Register temp = t0);
   void sub (Register Rd, Register Rn, int64_t decrement, Register temp = t0);
   void subw(Register Rd, Register Rn, int32_t decrement, Register temp = t0);

 #define INSN(NAME)                                               \
   inline void NAME(Register Rd, Register Rs1, Register Rs2) {    \
     Assembler::NAME(Rd, Rs1, Rs2);                               \
   }

   INSN(add);
   INSN(addw);
   INSN(sub);
   INSN(subw);

 #undef INSN

   // logic
   void andrw(Register Rd, Register Rs1, Register Rs2);
   void orrw(Register Rd, Register Rs1, Register Rs2);
   void xorrw(Register Rd, Register Rs1, Register Rs2);

   // logic with negate
   void andn(Register Rd, Register Rs1, Register Rs2);
   void orn(Register Rd, Register Rs1, Register Rs2);

   // revb
   void revb_h_h(Register Rd, Register Rs, Register tmp = t0);                           // reverse bytes in halfword in lower 16 bits, sign-extend
   void revb_w_w(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);      // reverse bytes in lower word, sign-extend
   void revb_h_h_u(Register Rd, Register Rs, Register tmp = t0);                         // reverse bytes in halfword in lower 16 bits, zero-extend
   void revb_h_w_u(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);    // reverse bytes in halfwords in lower 32 bits, zero-extend
   void revb_h_helper(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2= t1);  // reverse bytes in upper 16 bits (48:63) and move to lower
   void revb_h(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2= t1);         // reverse bytes in each halfword
   void revb_w(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2= t1);         // reverse bytes in each word
   void revb(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);          // reverse bytes in doubleword

   void ror_imm(Register dst, Register src, uint32_t shift, Register tmp = t0);
   void rolw_imm(Register dst, Register src, uint32_t, Register tmp = t0);
   void andi(Register Rd, Register Rn, int64_t imm, Register tmp = t0);
   void orptr(Address adr, RegisterOrConstant src, Register tmp1 = t0, Register tmp2 = t1);

 // Load and Store Instructions
 #define INSN_ENTRY_RELOC(result_type, header)                               \
   result_type header {                                                      \
     guarantee(rtype == relocInfo::internal_word_type,                       \
               "only internal_word_type relocs make sense here");            \
     relocate(InternalAddress(dest).rspec());                                \
     IncompressibleRegion ir(this);  /* relocations */

 #define INSN(NAME)                                                                                 \
   void NAME(Register Rd, address dest) {                                                           \
     assert_cond(dest != nullptr);                                                                  \
     int64_t distance = dest - pc();                                                                \
     if (is_valid_32bit_offset(distance)) {                                                         \
       auipc(Rd, (int32_t)distance + 0x800);                                                        \
       Assembler::NAME(Rd, Rd, ((int32_t)distance << 20) >> 20);                                    \
     } else {                                                                                       \
       int32_t offset = 0;                                                                          \
       movptr(Rd, dest, offset);                                                                    \
       Assembler::NAME(Rd, Rd, offset);                                                             \
     }                                                                                              \
   }                                                                                                \
   INSN_ENTRY_RELOC(void, NAME(Register Rd, address dest, relocInfo::relocType rtype))              \
     NAME(Rd, dest);                                                                                \
   }                                                                                                \
   void NAME(Register Rd, const Address &adr, Register temp = t0) {                                 \
     switch (adr.getMode()) {                                                                       \
       case Address::literal: {                                                                     \
         relocate(adr.rspec(), [&] {                                                                \
           NAME(Rd, adr.target());                                                                  \
         });                                                                                        \
         break;                                                                                     \
       }                                                                                            \
       case Address::base_plus_offset: {                                                            \
         if (is_simm12(adr.offset())) {                                                             \
           Assembler::NAME(Rd, adr.base(), adr.offset());                                           \
         } else {                                                                                   \
           int32_t offset = ((int32_t)adr.offset() << 20) >> 20;                                    \
           if (Rd == adr.base()) {                                                                  \
             la(temp, Address(adr.base(), adr.offset() - offset));                                  \
             Assembler::NAME(Rd, temp, offset);                                                     \
           } else {                                                                                 \
             la(Rd, Address(adr.base(), adr.offset() - offset));                                    \
             Assembler::NAME(Rd, Rd, offset);                                                       \
           }                                                                                        \
         }                                                                                          \
         break;                                                                                     \
       }                                                                                            \
       default:                                                                                     \
         ShouldNotReachHere();                                                                      \
     }                                                                                              \
   }                                                                                                \
   void NAME(Register Rd, Label &L) {                                                               \
     wrap_label(Rd, L, &MacroAssembler::NAME);                                                      \
   }

   INSN(lb);
   INSN(lbu);
   INSN(lh);
   INSN(lhu);
   INSN(lw);
   INSN(lwu);
   INSN(ld);

 #undef INSN

 #define INSN(NAME)                                                                                 \
   void NAME(FloatRegister Rd, address dest, Register temp = t0) {                                  \
     assert_cond(dest != nullptr);                                                                  \
     int64_t distance = dest - pc();                                                                \
     if (is_valid_32bit_offset(distance)) {                                                         \
       auipc(temp, (int32_t)distance + 0x800);                                                      \
       Assembler::NAME(Rd, temp, ((int32_t)distance << 20) >> 20);                                  \
     } else {                                                                                       \
       int32_t offset = 0;                                                                          \
       movptr(temp, dest, offset);                                                                  \
       Assembler::NAME(Rd, temp, offset);                                                           \
     }                                                                                              \
   }                                                                                                \
   INSN_ENTRY_RELOC(void, NAME(FloatRegister Rd, address dest,                                      \
                               relocInfo::relocType rtype, Register temp = t0))                     \
     NAME(Rd, dest, temp);                                                                          \
   }                                                                                                \
   void NAME(FloatRegister Rd, const Address &adr, Register temp = t0) {                            \
     switch (adr.getMode()) {                                                                       \
       case Address::literal: {                                                                     \
         relocate(adr.rspec(), [&] {                                                                \
           NAME(Rd, adr.target(), temp);                                                            \
         });                                                                                        \
         break;                                                                                     \
       }                                                                                            \
       case Address::base_plus_offset: {                                                            \
         if (is_simm12(adr.offset())) {                                                             \
           Assembler::NAME(Rd, adr.base(), adr.offset());                                           \
         } else {                                                                                   \
           int32_t offset = ((int32_t)adr.offset() << 20) >> 20;                                    \
           la(temp, Address(adr.base(), adr.offset() - offset));                                    \
           Assembler::NAME(Rd, temp, offset);                                                       \
         }                                                                                          \
         break;                                                                                     \
       }                                                                                            \
       default:                                                                                     \
         ShouldNotReachHere();                                                                      \
     }                                                                                              \
   }

   INSN(flw);
   INSN(fld);

 #undef INSN

 #define INSN(NAME, REGISTER)                                                                       \
   INSN_ENTRY_RELOC(void, NAME(REGISTER Rs, address dest,                                           \
                               relocInfo::relocType rtype, Register temp = t0))                     \
     NAME(Rs, dest, temp);                                                                          \
   }

   INSN(sb,  Register);
   INSN(sh,  Register);
   INSN(sw,  Register);
   INSN(sd,  Register);
   INSN(fsw, FloatRegister);
   INSN(fsd, FloatRegister);

 #undef INSN

 #define INSN(NAME)                                                                                 \
   void NAME(Register Rs, address dest, Register temp = t0) {                                       \
     assert_cond(dest != nullptr);                                                                  \
     assert_different_registers(Rs, temp);                                                          \
     int64_t distance = dest - pc();                                                                \
     if (is_valid_32bit_offset(distance)) {                                                         \
       auipc(temp, (int32_t)distance + 0x800);                                                      \
       Assembler::NAME(Rs, temp, ((int32_t)distance << 20) >> 20);                                  \
     } else {                                                                                       \
       int32_t offset = 0;                                                                          \
       movptr(temp, dest, offset);                                                                  \
       Assembler::NAME(Rs, temp, offset);                                                           \
     }                                                                                              \
   }                                                                                                \
   void NAME(Register Rs, const Address &adr, Register temp = t0) {                                 \
     switch (adr.getMode()) {                                                                       \
       case Address::literal: {                                                                     \
         assert_different_registers(Rs, temp);                                                      \
         relocate(adr.rspec(), [&] {                                                                \
           NAME(Rs, adr.target(), temp);                                                            \
         });                                                                                        \
         break;                                                                                     \
       }                                                                                            \
       case Address::base_plus_offset: {                                                            \
         if (is_simm12(adr.offset())) {                                                             \
           Assembler::NAME(Rs, adr.base(), adr.offset());                                           \
         } else {                                                                                   \
           assert_different_registers(Rs, temp);                                                    \
           int32_t offset = ((int32_t)adr.offset() << 20) >> 20;                                    \
           la(temp, Address(adr.base(), adr.offset() - offset));                                    \
           Assembler::NAME(Rs, temp, offset);                                                       \
         }                                                                                          \
         break;                                                                                     \
       }                                                                                            \
       default:                                                                                     \
         ShouldNotReachHere();                                                                      \
     }                                                                                              \
   }

   INSN(sb);
   INSN(sh);
   INSN(sw);
   INSN(sd);

 #undef INSN

 #define INSN(NAME)                                                                                 \
   void NAME(FloatRegister Rs, address dest, Register temp = t0) {                                  \
     assert_cond(dest != nullptr);                                                                  \
     int64_t distance = dest - pc();                                                                \
     if (is_valid_32bit_offset(distance)) {                                                         \
       auipc(temp, (int32_t)distance + 0x800);                                                      \
       Assembler::NAME(Rs, temp, ((int32_t)distance << 20) >> 20);                                  \
     } else {                                                                                       \
       int32_t offset = 0;                                                                          \
       movptr(temp, dest, offset);                                                                  \
       Assembler::NAME(Rs, temp, offset);                                                           \
     }                                                                                              \
   }                                                                                                \
   void NAME(FloatRegister Rs, const Address &adr, Register temp = t0) {                            \
     switch (adr.getMode()) {                                                                       \
       case Address::literal: {                                                                     \
         relocate(adr.rspec(), [&] {                                                                \
           NAME(Rs, adr.target(), temp);                                                            \
         });                                                                                        \
         break;                                                                                     \
       }                                                                                            \
       case Address::base_plus_offset: {                                                            \
         if (is_simm12(adr.offset())) {                                                             \
           Assembler::NAME(Rs, adr.base(), adr.offset());                                           \
         } else {                                                                                   \
           int32_t offset = ((int32_t)adr.offset() << 20) >> 20;                                    \
           la(temp, Address(adr.base(), adr.offset() - offset));                                    \
           Assembler::NAME(Rs, temp, offset);                                                       \
         }                                                                                          \
         break;                                                                                     \
       }                                                                                            \
       default:                                                                                     \
         ShouldNotReachHere();                                                                      \
     }                                                                                              \
   }

   INSN(fsw);
   INSN(fsd);

 #undef INSN

 #undef INSN_ENTRY_RELOC

   void cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, Label &succeed, Label *fail);
   void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, Label &succeed, Label *fail);
   void cmpxchg(Register addr, Register expected,
                Register new_val,
                enum operand_size size,
                Assembler::Aqrl acquire, Assembler::Aqrl release,
                Register result, bool result_as_bool = false);
   void cmpxchg_weak(Register addr, Register expected,
                     Register new_val,
                     enum operand_size size,
                     Assembler::Aqrl acquire, Assembler::Aqrl release,
                     Register result);
   void cmpxchg_narrow_value_helper(Register addr, Register expected,
                                    Register new_val,
                                    enum operand_size size,
                                    Register tmp1, Register tmp2, Register tmp3);
   void cmpxchg_narrow_value(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             Assembler::Aqrl acquire, Assembler::Aqrl release,
                             Register result, bool result_as_bool,
                             Register tmp1, Register tmp2, Register tmp3);
   void weak_cmpxchg_narrow_value(Register addr, Register expected,
                                  Register new_val,
                                  enum operand_size size,
                                  Assembler::Aqrl acquire, Assembler::Aqrl release,
                                  Register result,
                                  Register tmp1, Register tmp2, Register tmp3);

   void atomic_add(Register prev, RegisterOrConstant incr, Register addr);
   void atomic_addw(Register prev, RegisterOrConstant incr, Register addr);
   void atomic_addal(Register prev, RegisterOrConstant incr, Register addr);
   void atomic_addalw(Register prev, RegisterOrConstant incr, Register addr);

   void atomic_xchg(Register prev, Register newv, Register addr);
   void atomic_xchgw(Register prev, Register newv, Register addr);
   void atomic_xchgal(Register prev, Register newv, Register addr);
   void atomic_xchgalw(Register prev, Register newv, Register addr);
   void atomic_xchgwu(Register prev, Register newv, Register addr);
   void atomic_xchgalwu(Register prev, Register newv, Register addr);

   static bool far_branches() {
     return ReservedCodeCacheSize > branch_range;
   }

   // Emit a direct call/jump if the entry address will always be in range,
   // otherwise a far call/jump.
   // The address must be inside the code cache.
   // Supported entry.rspec():
   // - relocInfo::external_word_type
   // - relocInfo::runtime_call_type
   // - relocInfo::none
   // In the case of a far call/jump, the entry address is put in the tmp register.
   // The tmp register is invalidated.
   void far_call(Address entry, Register tmp = t0);
   void far_jump(Address entry, Register tmp = t0);

   static int far_branch_size() {
     if (far_branches()) {
       return 2 * 4;  // auipc + jalr, see far_call() & far_jump()
     } else {
       return 4;
     }
   }

   void load_byte_map_base(Register reg);

   void bang_stack_with_offset(int offset) {
     // stack grows down, caller passes positive offset
     assert(offset > 0, "must bang with negative offset");
     sub(t0, sp, offset);
     sd(zr, Address(t0));
   }

   void la_patchable(Register reg1, const Address &dest, int32_t &offset);

   virtual void _call_Unimplemented(address call_site) {
     mv(t1, call_site);
   }

   #define call_Unimplemented() _call_Unimplemented((address)__PRETTY_FUNCTION__)

   // Frame creation and destruction shared between JITs.
   void build_frame(int framesize);
   void remove_frame(int framesize);

   void reserved_stack_check();

   void get_polling_page(Register dest, relocInfo::relocType rtype);
   void read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype);

   // RISCV64 OpenJDK uses four different types of calls:
   //   - direct call: jal pc_relative_offset
   //     This is the shortest and the fastest, but the offset has the range: +/-1MB.
   //
   //   - far call: auipc reg, pc_relative_offset; jalr ra, reg, offset
   //     This is longer than a direct call. The offset has
   //     the range [-(2G + 2K), 2G - 2K). Addresses out of the range in the code cache
   //     requires indirect call.
   //     If a jump is needed rather than a call, a far jump 'jalr x0, reg, offset' can
   //     be used instead.
   //     All instructions are embedded at a call site.
   //
   //   - trampoline call:
   //     This is only available in C1/C2-generated code (nmethod). It is a combination
   //     of a direct call, which is used if the destination of a call is in range,
   //     and a register-indirect call. It has the advantages of reaching anywhere in
   //     the RISCV address space and being patchable at runtime when the generated
   //     code is being executed by other threads.
   //
   //     [Main code section]
   //       jal trampoline
   //     [Stub code section]
   //     trampoline:
   //       ld    reg, pc + 8 (auipc + ld)
   //       jr    reg
   //       <64-bit destination address>
   //
   //     If the destination is in range when the generated code is moved to the code
   //     cache, 'jal trampoline' is replaced with 'jal destination' and the trampoline
   //     is not used.
   //     The optimization does not remove the trampoline from the stub section.

   //     This is necessary because the trampoline may well be redirected later when
   //     code is patched, and the new destination may not be reachable by a simple JAL
   //     instruction.
   //
   //   - indirect call: movptr + jalr
   //     This too can reach anywhere in the address space, but it cannot be
   //     patched while code is running, so it must only be modified at a safepoint.
   //     This form of call is most suitable for targets at fixed addresses, which
   //     will never be patched.
   //
   //
   // To patch a trampoline call when the JAL can't reach, we first modify
   // the 64-bit destination address in the trampoline, then modify the
   // JAL to point to the trampoline, then flush the instruction cache to
   // broadcast the change to all executing threads. See
   // NativeCall::set_destination_mt_safe for the details.
   //
   // There is a benign race in that the other thread might observe the
   // modified JAL before it observes the modified 64-bit destination
   // address. That does not matter because the destination method has been
   // invalidated, so there will be a trap at its start.
   // For this to work, the destination address in the trampoline is
   // always updated, even if we're not using the trampoline.

   // Emit a direct call if the entry address will always be in range,
   // otherwise a trampoline call.
   // Supported entry.rspec():
   // - relocInfo::runtime_call_type
   // - relocInfo::opt_virtual_call_type
   // - relocInfo::static_call_type
   // - relocInfo::virtual_call_type
   //
   // Return: the call PC or null if CodeCache is full.
   address trampoline_call(Address entry);
   address ic_call(address entry, jint method_index = 0);

   // Support for memory inc/dec
   // n.b. increment/decrement calls with an Address destination will
   // need to use a scratch register to load the value to be
   // incremented. increment/decrement calls which add or subtract a
   // constant value other than sign-extended 12-bit immediate will need
   // to use a 2nd scratch register to hold the constant. so, an address
   // increment/decrement may trash both t0 and t1.

   void increment(const Address dst, int64_t value = 1, Register tmp1 = t0, Register tmp2 = t1);
   void incrementw(const Address dst, int32_t value = 1, Register tmp1 = t0, Register tmp2 = t1);

   void decrement(const Address dst, int64_t value = 1, Register tmp1 = t0, Register tmp2 = t1);
   void decrementw(const Address dst, int32_t value = 1, Register tmp1 = t0, Register tmp2 = t1);

   void cmpptr(Register src1, Address src2, Label& equal);

   void clinit_barrier(Register klass, Register tmp, Label* L_fast_path = nullptr, Label* L_slow_path = nullptr);
   void load_method_holder_cld(Register result, Register method);
   void load_method_holder(Register holder, Register method);

   void compute_index(Register str1, Register trailing_zeros, Register match_mask,
                      Register result, Register char_tmp, Register tmp,
                      bool haystack_isL);
   void compute_match_mask(Register src, Register pattern, Register match_mask,
                           Register mask1, Register mask2);

 #ifdef COMPILER2
   void mul_add(Register out, Register in, Register offset,
                Register len, Register k, Register tmp);
   void cad(Register dst, Register src1, Register src2, Register carry);
   void cadc(Register dst, Register src1, Register src2, Register carry);
   void adc(Register dst, Register src1, Register src2, Register carry);
   void add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                        Register src1, Register src2, Register carry);
   void multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
                              Register y, Register y_idx, Register z,
                              Register carry, Register product,
                              Register idx, Register kdx);
   void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                              Register y, Register y_idx, Register z,
                              Register carry, Register product,
                              Register idx, Register kdx);
   void multiply_128_x_128_loop(Register y, Register z,
                                Register carry, Register carry2,
                                Register idx, Register jdx,
                                Register yz_idx1, Register yz_idx2,
                                Register tmp, Register tmp3, Register tmp4,
                                Register tmp6, Register product_hi);
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                        Register z, Register zlen,
                        Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                        Register tmp5, Register tmp6, Register product_hi);
 #endif

   void inflate_lo32(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);
   void inflate_hi32(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);

   void ctzc_bit(Register Rd, Register Rs, bool isLL = false, Register tmp1 = t0, Register tmp2 = t1);

   void zero_words(Register base, uint64_t cnt);
   address zero_words(Register ptr, Register cnt);
   void fill_words(Register base, Register cnt, Register value);
   void zero_memory(Register addr, Register len, Register tmp);
   void zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2);

   // shift left by shamt and add
   void shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt);

   // test single bit in Rs, result is set to Rd
   void test_bit(Register Rd, Register Rs, uint32_t bit_pos);

   // Here the float instructions with safe deal with some exceptions.
   // e.g. convert from NaN, +Inf, -Inf to int, float, double
   // will trigger exception, we need to deal with these situations
   // to get correct results.
   void fcvt_w_s_safe(Register dst, FloatRegister src, Register tmp = t0);
   void fcvt_l_s_safe(Register dst, FloatRegister src, Register tmp = t0);
   void fcvt_w_d_safe(Register dst, FloatRegister src, Register tmp = t0);
   void fcvt_l_d_safe(Register dst, FloatRegister src, Register tmp = t0);

   void java_round_float(Register dst, FloatRegister src, FloatRegister ftmp);
   void java_round_double(Register dst, FloatRegister src, FloatRegister ftmp);

   // vector load/store unit-stride instructions
   void vlex_v(VectorRegister vd, Register base, Assembler::SEW sew, VectorMask vm = unmasked) {
     switch (sew) {
       case Assembler::e64:
         vle64_v(vd, base, vm);
         break;
       case Assembler::e32:
         vle32_v(vd, base, vm);
         break;
       case Assembler::e16:
         vle16_v(vd, base, vm);
         break;
       case Assembler::e8: // fall through
       default:
         vle8_v(vd, base, vm);
         break;
     }
   }

   void vsex_v(VectorRegister store_data, Register base, Assembler::SEW sew, VectorMask vm = unmasked) {
     switch (sew) {
       case Assembler::e64:
         vse64_v(store_data, base, vm);
         break;
       case Assembler::e32:
         vse32_v(store_data, base, vm);
         break;
       case Assembler::e16:
         vse16_v(store_data, base, vm);
         break;
       case Assembler::e8: // fall through
       default:
         vse8_v(store_data, base, vm);
         break;
     }
   }

   // vector pseudo instructions
   inline void vl1r_v(VectorRegister vd, Register rs) {
     vl1re8_v(vd, rs);
   }

   inline void vmnot_m(VectorRegister vd, VectorRegister vs) {
     vmnand_mm(vd, vs, vs);
   }

   inline void vncvt_x_x_w(VectorRegister vd, VectorRegister vs, VectorMask vm = unmasked) {
     vnsrl_wx(vd, vs, x0, vm);
   }

   inline void vneg_v(VectorRegister vd, VectorRegister vs, VectorMask vm = unmasked) {
     vrsub_vx(vd, vs, x0, vm);
   }

   inline void vfneg_v(VectorRegister vd, VectorRegister vs, VectorMask vm = unmasked) {
     vfsgnjn_vv(vd, vs, vs, vm);
   }

   inline void vfabs_v(VectorRegister vd, VectorRegister vs, VectorMask vm = unmasked) {
     vfsgnjx_vv(vd, vs, vs, vm);
   }

   inline void vmsgt_vv(VectorRegister vd, VectorRegister vs2, VectorRegister vs1, VectorMask vm = unmasked) {
     vmslt_vv(vd, vs1, vs2, vm);
   }

   inline void vmsgtu_vv(VectorRegister vd, VectorRegister vs2, VectorRegister vs1, VectorMask vm = unmasked) {
     vmsltu_vv(vd, vs1, vs2, vm);
   }

   inline void vmsge_vv(VectorRegister vd, VectorRegister vs2, VectorRegister vs1, VectorMask vm = unmasked) {
     vmsle_vv(vd, vs1, vs2, vm);
   }

   inline void vmsgeu_vv(VectorRegister vd, VectorRegister vs2, VectorRegister vs1, VectorMask vm = unmasked) {
     vmsleu_vv(vd, vs1, vs2, vm);
   }

   inline void vmfgt_vv(VectorRegister vd, VectorRegister vs2, VectorRegister vs1, VectorMask vm = unmasked) {
     vmflt_vv(vd, vs1, vs2, vm);
   }

   inline void vmfge_vv(VectorRegister vd, VectorRegister vs2, VectorRegister vs1, VectorMask vm = unmasked) {
     vmfle_vv(vd, vs1, vs2, vm);
   }

   // Copy mask register
   inline void vmmv_m(VectorRegister vd, VectorRegister vs) {
     vmand_mm(vd, vs, vs);
   }

   // Clear mask register
   inline void vmclr_m(VectorRegister vd) {
     vmxor_mm(vd, vd, vd);
   }

   // Set mask register
   inline void vmset_m(VectorRegister vd) {
     vmxnor_mm(vd, vd, vd);
   }

   static const int zero_words_block_size;

   void cast_primitive_type(BasicType type, Register Rt) {
     switch (type) {
       case T_BOOLEAN:
         sltu(Rt, zr, Rt);
         break;
       case T_CHAR   :
         zero_extend(Rt, Rt, 16);
         break;
       case T_BYTE   :
         sign_extend(Rt, Rt, 8);
         break;
       case T_SHORT  :
         sign_extend(Rt, Rt, 16);
         break;
       case T_INT    :
         sign_extend(Rt, Rt, 32);
         break;
       case T_LONG   : /* nothing to do */        break;
       case T_VOID   : /* nothing to do */        break;
       case T_FLOAT  : /* nothing to do */        break;
       case T_DOUBLE : /* nothing to do */        break;
       default: ShouldNotReachHere();
     }
   }

   // float cmp with unordered_result
   void float_compare(Register result, FloatRegister Rs1, FloatRegister Rs2, int unordered_result);
   void double_compare(Register result, FloatRegister Rs1, FloatRegister Rs2, int unordered_result);

   // Zero/Sign-extend
   void zero_extend(Register dst, Register src, int bits);
   void sign_extend(Register dst, Register src, int bits);

   // compare src1 and src2 and get -1/0/1 in dst.
   // if [src1 > src2], dst = 1;
   // if [src1 == src2], dst = 0;
   // if [src1 < src2], dst = -1;
   void cmp_l2i(Register dst, Register src1, Register src2, Register tmp = t0);

   // support for argument shuffling
   void move32_64(VMRegPair src, VMRegPair dst, Register tmp = t0);
   void float_move(VMRegPair src, VMRegPair dst, Register tmp = t0);
   void long_move(VMRegPair src, VMRegPair dst, Register tmp = t0);
   void double_move(VMRegPair src, VMRegPair dst, Register tmp = t0);
   void object_move(OopMap* map,
                    int oop_handle_offset,
                    int framesize_in_slots,
                    VMRegPair src,
                    VMRegPair dst,
                    bool is_receiver,
                    int* receiver_offset);
   void rt_call(address dest, Register tmp = t0);

   void call(const address dest, Register temp = t0) {
     assert_cond(dest != nullptr);
     assert(temp != noreg, "expecting a register");
     int32_t offset = 0;
     mv(temp, dest, offset);
     jalr(x1, temp, offset);
   }

   inline void ret() {
     jalr(x0, x1, 0);
   }

 #ifdef ASSERT
   // Template short-hand support to clean-up after a failed call to trampoline
   // call generation (see trampoline_call() below), when a set of Labels must
   // be reset (before returning).
   template<typename Label, typename... More>
   void reset_labels(Label& lbl, More&... more) {
     lbl.reset(); reset_labels(more...);
   }
   template<typename Label>
   void reset_labels(Label& lbl) {
     lbl.reset();
   }
 #endif

 private:

   void repne_scan(Register addr, Register value, Register count, Register tmp);

   void ld_constant(Register dest, const Address &const_addr) {
     if (NearCpool) {
       ld(dest, const_addr);
     } else {
       InternalAddress target(const_addr.target());
       relocate(target.rspec(), [&] {
         int32_t offset;
         la_patchable(dest, target, offset);
         ld(dest, Address(dest, offset));
       });
     }
   }

   int bitset_to_regs(unsigned int bitset, unsigned char* regs);
   Address add_memory_helper(const Address dst, Register tmp);

   void load_reserved(Register addr, enum operand_size size, Assembler::Aqrl acquire);
   void store_conditional(Register addr, Register new_val, enum operand_size size, Assembler::Aqrl release);

 public:
   void lightweight_lock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow);
   void lightweight_unlock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow);
 };

 #ifdef ASSERT
 inline bool AbstractAssembler::pd_check_instruction_mark() { return false; }
 #endif

 /**
  * class SkipIfEqual:
  *
  * Instantiating this class will result in assembly code being output that will
  * jump around any code emitted between the creation of the instance and it's
  * automatic destruction at the end of a scope block, depending on the value of
  * the flag passed to the constructor, which will be checked at run-time.
  */
 class SkipIfEqual {
  private:
   MacroAssembler* _masm;
   Label _label;

  public:
    SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value);
    ~SkipIfEqual();
 };

 #endif // CPU_RISCV_MACROASSEMBLER_RISCV_HPP