// src/hotspot/cpu/x86/matcher_x86.hpp - toolchain/jdk/jdk21 - Git at Google

 /*
  * Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.
  *
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  *
  */

 #ifndef CPU_X86_MATCHER_X86_HPP
 #define CPU_X86_MATCHER_X86_HPP

   // Defined within class Matcher

   // The ecx parameter to rep stosq for the ClearArray node is in words,
   // not bytes, so this flag is false.
   static const bool init_array_count_is_in_bytes = false;

   // Whether this platform implements the scalable vector feature.
   // false here: this file does not advertise scalable vectors.
   static const bool implements_scalable_vector = false;

   // Compile-time answer to "are scalable vectors supported?": always false here.
   static constexpr bool supports_scalable_vector() { return false; }

   // Misaligned vector loads and stores are supported on x86,
   // so this is unconditionally true.
   static constexpr bool misaligned_vectors_ok() { return true; }

   // Whether code generation needs accurate ConvI2L types.
   static const bool convi2l_type_required = true;

   // Do the processor's shift instructions only use the low 5/6 bits
   // of the count for 32/64 bit ints? If not we need to do the masking
   // ourselves.
   // false here: no explicit masking of the shift count is requested.
   static const bool need_masked_shift_count = false;

   // Does the CPU require late expand (see block.cpp for description of late expand)?
   // false here: late (post-allocation) expand is not required.
   static const bool require_postalloc_expand = false;

   // x86 supports generic vector operands: vec and legVec.
   static const bool supports_generic_vector_operands = true;

   // Answers: will one (StoreL ConL) be cheaper than two (StoreI ConI)?
   // The argument is not inspected. The result is true when _LP64 is defined
   // (probably always true there, even if a temp register is required) and
   // false otherwise.
   // A narrower check, value == (int) value (cf. storeImmL and immL32),
   // is left disabled.
   static constexpr bool isSimpleConstant64(jlong value) {
 #ifndef _LP64
     return false;
 #else
     return true;
 #endif
   }

 #ifdef _LP64
   // No additional cost for CMOVL.
   static constexpr int long_cmove_cost() { return 0; }
 #else
   // Needs 2 CMOV's for longs.
   static constexpr int long_cmove_cost() { return 1; }
 #endif

 #ifdef _LP64
   // No CMOVF/CMOVD with SSE2
   static int float_cmove_cost() { return ConditionalMoveLimit; }
 #else
   // No CMOVF/CMOVD with SSE/SSE2
   static int float_cmove_cost() { return (UseSSE>=1) ? ConditionalMoveLimit : 0; }
 #endif

   // Whether compressed oops may be decoded through a complex address
   // expression: true when LogMinObjAlignmentInBytes is at most 3.
   // Must only be reached with UseCompressedOops set; the NOT_LP64 guard
   // rejects calls on non-LP64 builds.
   static bool narrow_oop_use_complex_address() {
     NOT_LP64(ShouldNotCallThis();)
     assert(UseCompressedOops, "only for compressed oops code");
     const bool shift_is_small_enough = LogMinObjAlignmentInBytes <= 3;
     return shift_is_small_enough;
   }

   // Whether compressed klass pointers may be decoded through a complex
   // address expression: true when LogKlassAlignmentInBytes is at most 3.
   // Must only be reached with UseCompressedClassPointers set; the NOT_LP64
   // guard rejects calls on non-LP64 builds.
   static bool narrow_klass_use_complex_address() {
     NOT_LP64(ShouldNotCallThis();)
     assert(UseCompressedClassPointers, "only for compressed klass code");
     const bool shift_is_small_enough = LogKlassAlignmentInBytes <= 3;
     return shift_is_small_enough;
   }

   // Prefer ConN+DecodeN over ConP: always returns true.
   // The NOT_LP64 guard rejects calls on non-LP64 builds.
   // The return type is plain bool: a const qualifier on a by-value scalar
   // return is ignored by the compiler and only triggers -Wignored-qualifiers.
   static bool const_oop_prefer_decode() {
     NOT_LP64(ShouldNotCallThis();)
     return true;
   }

   // Prefer ConP over ConNKlass+DecodeNKlass: always returns false.
   // The NOT_LP64 guard rejects calls on non-LP64 builds.
   // The return type is plain bool: a const qualifier on a by-value scalar
   // return is ignored by the compiler and only triggers -Wignored-qualifiers.
   static bool const_klass_prefer_decode() {
     NOT_LP64(ShouldNotCallThis();)
     return false;
   }

   // Is it better to copy float constants, or load them directly from memory?
   // Intel can load a float constant from a direct address, requiring no
   // extra registers.  Most RISCs will have to materialize an address into a
   // register first, so they would do better to copy the constant from stack.
   // Hence this is true on x86.
   static const bool rematerialize_float_constants = true;

   // If the CPU can load and store misaligned doubles directly then no fixup is
   // needed.  Else we split the double into 2 integer pieces and move it
   // piece-by-piece.  Only happens when passing doubles into C code as the
   // Java calling convention forces doubles to be aligned.
   // true here: no fixup is needed.
   static const bool misaligned_doubles_ok = true;

   // Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
   // false when _LP64 is defined, true otherwise.
 #ifdef _LP64
   static const bool strict_fp_requires_explicit_rounding = false;
 #else
   static const bool strict_fp_requires_explicit_rounding = true;
 #endif

   // Are floats converted to double when stored to stack during deoptimization?
   //  - x32: only when the FPU is used for floats, i.e. when UseSSE is 0.
   //  - x64: never; floats are stored without conversion, so normal access works.
 #ifndef _LP64
   static bool float_in_double() { return UseSSE == 0; }
 #else
   static constexpr bool float_in_double() { return false; }
 #endif

   // Do ints take an entire long register or just half?
   // true when _LP64 is defined, false otherwise.
 #ifdef _LP64
   static const bool int_in_long = true;
 #else
   static const bool int_in_long = false;
 #endif


   // Does the CPU support vector variable shift instructions?
   // True once UseAVX is at least 2.
   static bool supports_vector_variable_shifts() { return UseAVX >= 2; }

   // Does the CPU support vector variable rotate instructions?
   // Unconditionally true here.
   static constexpr bool supports_vector_variable_rotates() { return true; }

   // Does the CPU support vector constant rotate instructions?
   // True when the rotate amount lies in the range [-128, 127].
   static constexpr bool supports_vector_constant_rotates(int shift) {
     return shift >= -128 && shift <= 127;
   }

   // Does the CPU support vector unsigned comparison instructions?
   // Always true; neither the vector length nor the element type is consulted.
   // Declared constexpr bool like the sibling constant predicates; the previous
   // const qualifier on the by-value return was ignored by the compiler.
   static constexpr bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
     return true;
   }

   // Some microarchitectures have mask registers used on vectors.
   // Reports whatever VM_Version::supports_evex() says.
   // The return type is plain bool: a const qualifier on a by-value scalar
   // return is ignored by the compiler and only triggers -Wignored-qualifiers.
   static bool has_predicated_vectors() {
     return VM_Version::supports_evex();
   }

   // Is there a fast l2f conversion?
   //   true  - yes, a fast conversion is available
   //   false - the conversion is done by a runtime call
   // Always true here.
   static constexpr bool convL2FSupported() { return true; }

   // Implements a variant of EncodeISOArrayNode that encodes ASCII only.
   static const bool supports_encode_ascii_array = true;

   // The alltrue vector test needs an all-one vector as second argument
   // unless the input is predicated. Any other test never needs one.
   static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) {
     return is_predicate ? false : is_alltrue;
   }

   // BoolTest mask for vector test intrinsics.
   //   not alltrue                      -> ne
   //   alltrue, not predicated          -> lt
   //   alltrue, predicated, vlen < 8    -> eq
   //   alltrue, predicated, vlen == 8   -> eq unless VM_Version::supports_avx512dq()
   //   alltrue, predicated, otherwise   -> lt
   static constexpr BoolTest::mask vectortest_mask(bool is_alltrue, bool is_predicate, int vlen) {
     if (is_alltrue && is_predicate) {
       // supports_avx512dq() is only consulted for vlen == 8, as before.
       return (vlen < 8 || (vlen == 8 && !VM_Version::supports_avx512dq()))
                ? BoolTest::eq
                : BoolTest::lt;
     }
     return is_alltrue ? BoolTest::lt : BoolTest::ne;
   }

   // Pre-selection size estimate for a vector operation.
   // This is a rudimentary heuristic based on the emitted code size of complex
   // IR nodes. The unroll policy uses it to constrain the unrolling factor and
   // avoid bloated loop bodies. Opcodes not listed below are estimated at 0,
   // as are listed ones whose required CPU feature is reported by VM_Version.
   // vlen is currently not consulted.
   static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
     switch (vopc) {
       case Op_MulVB:
         return 7;

       case Op_MulVL:
         if (VM_Version::supports_avx512vldq()) {
           return 0;
         }
         return 6;

       case Op_VectorCastF2X: // fall through
       case Op_VectorCastD2X:
         if (is_floating_point_type(ety)) {
           return 0;
         }
         return is_subword_type(ety) ? 35 : 30;

       case Op_CountTrailingZerosV: // fall through
       case Op_CountLeadingZerosV: {
         const bool has_direct_support =
             VM_Version::supports_avx512cd() && (ety == T_INT || ety == T_LONG);
         return has_direct_support ? 0 : 40;
       }

       case Op_PopCountVI:
         if (is_subword_type(ety)) {
           return VM_Version::supports_avx512_bitalg() ? 0 : 50;
         }
         assert(ety == T_INT, "sanity"); // for documentation purposes
         return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;

       case Op_PopCountVL:
         return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;

       case Op_ReverseV:
         return VM_Version::supports_gfni() ? 0 : 30;

       case Op_RoundVF: // fall through
       case Op_RoundVD:
         return 30;

       default:
         return 0;
     }
   }

   // Pre-selection size estimate for a scalar operation:
   // 30 for Op_RoundF and Op_RoundD, 0 for every other opcode.
   // ety is currently not consulted.
   static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
     const bool is_round_op = (vopc == Op_RoundF) || (vopc == Op_RoundD);
     return is_round_op ? 30 : 0;
   }

 #endif // CPU_X86_MATCHER_X86_HPP
	/*
	* Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	*
	* This code is free software; you can redistribute it and/or modify it
	* under the terms of the GNU General Public License version 2 only, as
	* published by the Free Software Foundation.
	*
	* This code is distributed in the hope that it will be useful, but WITHOUT
	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	* version 2 for more details (a copy is included in the LICENSE file that
	* accompanied this code).
	*
	* You should have received a copy of the GNU General Public License version
	* 2 along with this work; if not, write to the Free Software Foundation,
	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	*
	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
	* or visit www.oracle.com if you need additional information or have any
	* questions.
	*
	*/

	#ifndef CPU_X86_MATCHER_X86_HPP
	#define CPU_X86_MATCHER_X86_HPP

	// Defined within class Matcher

	// The ecx parameter to rep stosq for the ClearArray node is in words,
	// not bytes, so this flag is false.
	static const bool init_array_count_is_in_bytes = false;

	// Whether this platform implements the scalable vector feature.
	// false here: this file does not advertise scalable vectors.
	static const bool implements_scalable_vector = false;

	// Compile-time answer to "are scalable vectors supported?": always false here.
	static constexpr bool supports_scalable_vector() { return false; }

	// Misaligned vector loads and stores are supported on x86,
	// so this is unconditionally true.
	static constexpr bool misaligned_vectors_ok() { return true; }

	// Whether code generation needs accurate ConvI2L types.
	static const bool convi2l_type_required = true;

	// Do the processor's shift instructions only use the low 5/6 bits
	// of the count for 32/64 bit ints? If not we need to do the masking
	// ourselves.
	// false here: no explicit masking of the shift count is requested.
	static const bool need_masked_shift_count = false;

	// Does the CPU require late expand (see block.cpp for description of late expand)?
	// false here: late (post-allocation) expand is not required.
	static const bool require_postalloc_expand = false;

	// x86 supports generic vector operands: vec and legVec.
	static const bool supports_generic_vector_operands = true;

	// Answers: will one (StoreL ConL) be cheaper than two (StoreI ConI)?
	// The argument is not inspected. The result is true when _LP64 is defined
	// (probably always true there, even if a temp register is required) and
	// false otherwise.
	// A narrower check, value == (int) value (cf. storeImmL and immL32),
	// is left disabled.
	static constexpr bool isSimpleConstant64(jlong value) {
	#ifndef _LP64
	  return false;
	#else
	  return true;
	#endif
	}

	// Extra cost of a conditional move on a long value.
	static constexpr int long_cmove_cost() {
	#ifdef _LP64
	  // No additional cost for CMOVL.
	  return 0;
	#else
	  // Needs 2 CMOV's for longs.
	  return 1;
	#endif
	}

	// Cost of a conditional move on a float/double value.
	static int float_cmove_cost() {
	#ifdef _LP64
	  // No CMOVF/CMOVD with SSE2
	  return ConditionalMoveLimit;
	#else
	  // No CMOVF/CMOVD with SSE/SSE2
	  if (UseSSE >= 1) {
	    return ConditionalMoveLimit;
	  }
	  return 0;
	#endif
	}

	// Whether compressed oops may be decoded through a complex address
	// expression: true when LogMinObjAlignmentInBytes is at most 3.
	// Must only be reached with UseCompressedOops set; the NOT_LP64 guard
	// rejects calls on non-LP64 builds.
	static bool narrow_oop_use_complex_address() {
	  NOT_LP64(ShouldNotCallThis();)
	  assert(UseCompressedOops, "only for compressed oops code");
	  const bool shift_is_small_enough = LogMinObjAlignmentInBytes <= 3;
	  return shift_is_small_enough;
	}

	// Whether compressed klass pointers may be decoded through a complex
	// address expression: true when LogKlassAlignmentInBytes is at most 3.
	// Must only be reached with UseCompressedClassPointers set; the NOT_LP64
	// guard rejects calls on non-LP64 builds.
	static bool narrow_klass_use_complex_address() {
	  NOT_LP64(ShouldNotCallThis();)
	  assert(UseCompressedClassPointers, "only for compressed klass code");
	  const bool shift_is_small_enough = LogKlassAlignmentInBytes <= 3;
	  return shift_is_small_enough;
	}

	// Prefer ConN+DecodeN over ConP: always returns true.
	// The NOT_LP64 guard rejects calls on non-LP64 builds.
	// The return type is plain bool: a const qualifier on a by-value scalar
	// return is ignored by the compiler and only triggers -Wignored-qualifiers.
	static bool const_oop_prefer_decode() {
	  NOT_LP64(ShouldNotCallThis();)
	  return true;
	}

	// Prefer ConP over ConNKlass+DecodeNKlass: always returns false.
	// The NOT_LP64 guard rejects calls on non-LP64 builds.
	// The return type is plain bool: a const qualifier on a by-value scalar
	// return is ignored by the compiler and only triggers -Wignored-qualifiers.
	static bool const_klass_prefer_decode() {
	  NOT_LP64(ShouldNotCallThis();)
	  return false;
	}

	// Is it better to copy float constants, or load them directly from memory?
	// Intel can load a float constant from a direct address, requiring no
	// extra registers. Most RISCs will have to materialize an address into a
	// register first, so they would do better to copy the constant from stack.
	// Hence this is true on x86.
	static const bool rematerialize_float_constants = true;

	// If the CPU can load and store misaligned doubles directly then no fixup is
	// needed. Else we split the double into 2 integer pieces and move it
	// piece-by-piece. Only happens when passing doubles into C code as the
	// Java calling convention forces doubles to be aligned.
	// true here: no fixup is needed.
	static const bool misaligned_doubles_ok = true;

	// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
	// false when _LP64 is defined, true otherwise.
	#ifdef _LP64
	static const bool strict_fp_requires_explicit_rounding = false;
	#else
	static const bool strict_fp_requires_explicit_rounding = true;
	#endif

	// Are floats converted to double when stored to stack during deoptimization?
	//  - x32: only when the FPU is used for floats, i.e. when UseSSE is 0.
	//  - x64: never; floats are stored without conversion, so normal access works.
	#ifndef _LP64
	static bool float_in_double() { return UseSSE == 0; }
	#else
	static constexpr bool float_in_double() { return false; }
	#endif

	// Do ints take an entire long register or just half?
	// true when _LP64 is defined, false otherwise.
	#ifdef _LP64
	static const bool int_in_long = true;
	#else
	static const bool int_in_long = false;
	#endif


	// Does the CPU support vector variable shift instructions?
	// True once UseAVX is at least 2.
	static bool supports_vector_variable_shifts() { return UseAVX >= 2; }

	// Does the CPU support vector variable rotate instructions?
	// Unconditionally true here.
	static constexpr bool supports_vector_variable_rotates() { return true; }

	// Does the CPU support vector constant rotate instructions?
	// True when the rotate amount lies in the range [-128, 127].
	static constexpr bool supports_vector_constant_rotates(int shift) {
	  return shift >= -128 && shift <= 127;
	}

	// Does the CPU support vector unsigned comparison instructions?
	// Always true; neither the vector length nor the element type is consulted.
	// Declared constexpr bool like the sibling constant predicates; the previous
	// const qualifier on the by-value return was ignored by the compiler.
	static constexpr bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
	  return true;
	}

	// Some microarchitectures have mask registers used on vectors.
	// Reports whatever VM_Version::supports_evex() says.
	// The return type is plain bool: a const qualifier on a by-value scalar
	// return is ignored by the compiler and only triggers -Wignored-qualifiers.
	static bool has_predicated_vectors() {
	  return VM_Version::supports_evex();
	}

	// Is there a fast l2f conversion?
	//   true  - yes, a fast conversion is available
	//   false - the conversion is done by a runtime call
	// Always true here.
	static constexpr bool convL2FSupported() { return true; }

	// Implements a variant of EncodeISOArrayNode that encodes ASCII only.
	static const bool supports_encode_ascii_array = true;

	// The alltrue vector test needs an all-one vector as second argument
	// unless the input is predicated. Any other test never needs one.
	static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) {
	  return is_predicate ? false : is_alltrue;
	}

	// BoolTest mask for vector test intrinsics.
	//   not alltrue                      -> ne
	//   alltrue, not predicated          -> lt
	//   alltrue, predicated, vlen < 8    -> eq
	//   alltrue, predicated, vlen == 8   -> eq unless VM_Version::supports_avx512dq()
	//   alltrue, predicated, otherwise   -> lt
	static constexpr BoolTest::mask vectortest_mask(bool is_alltrue, bool is_predicate, int vlen) {
	  if (!is_alltrue) {
	    return BoolTest::ne;
	  }
	  if (!is_predicate) {
	    return BoolTest::lt;
	  }
	  // The logical-or operator had been mangled into an escaped "\|\|" sequence,
	  // which is not valid C++; it is restored here.
	  if ((vlen == 8 && !VM_Version::supports_avx512dq()) || vlen < 8) {
	    return BoolTest::eq;
	  }
	  return BoolTest::lt;
	}

	// Returns pre-selection estimated size of a vector operation.
	// Currently, it's a rudimentary heuristic based on emitted code size for complex
	// IR nodes used by unroll policy. Idea is to constrain unrolling factor and prevent
	// generating bloated loop bodies.
	// Opcodes not listed below are estimated at 0, as are listed ones whose
	// required CPU feature is reported by VM_Version. vlen is currently not consulted.
	static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
	  switch(vopc) {
	    default:
	      return 0;
	    case Op_MulVB:
	      return 7;
	    case Op_MulVL:
	      return VM_Version::supports_avx512vldq() ? 0 : 6;
	    case Op_VectorCastF2X: // fall through
	    case Op_VectorCastD2X:
	      return is_floating_point_type(ety) ? 0 : (is_subword_type(ety) ? 35 : 30);
	    case Op_CountTrailingZerosV:
	    case Op_CountLeadingZerosV:
	      // The logical-or operator had been mangled into an escaped "\|\|"
	      // sequence, which is not valid C++; it is restored here.
	      return VM_Version::supports_avx512cd() && (ety == T_INT || ety == T_LONG) ? 0 : 40;
	    case Op_PopCountVI:
	      if (is_subword_type(ety)) {
	        return VM_Version::supports_avx512_bitalg() ? 0 : 50;
	      } else {
	        assert(ety == T_INT, "sanity"); // for documentation purposes
	        return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
	      }
	    case Op_PopCountVL:
	      return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
	    case Op_ReverseV:
	      return VM_Version::supports_gfni() ? 0 : 30;
	    case Op_RoundVF: // fall through
	    case Op_RoundVD:
	      return 30;
	  }
	}

	// Pre-selection size estimate for a scalar operation:
	// 30 for Op_RoundF and Op_RoundD, 0 for every other opcode.
	// ety is currently not consulted.
	static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
	  const bool is_round_op = (vopc == Op_RoundF) || (vopc == Op_RoundD);
	  return is_round_op ? 30 : 0;
	}

	#endif // CPU_X86_MATCHER_X86_HPP