/*
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
Register tmp2Reg, Register tmp3Reg) {
Register oop = objectReg;
Register box = boxReg;
Register disp_hdr = tmpReg;
Register tmp = tmp2Reg;
Label cont;
Label object_has_monitor;
Label count, no_count;
assert_different_registers(oop, box, tmp, disp_hdr);
// Load markWord from object into displaced_header.
ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
if (DiagnoseSyncOnValueBasedClasses != 0) {
load_klass(tmp, oop);
ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
br(Assembler::NE, cont);
}
// Check for existing monitor
tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
if (LockingMode == LM_MONITOR) {
tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
b(cont);
} else if (LockingMode == LM_LEGACY) {
// Set tmp to be (markWord of object | UNLOCK_VALUE).
orr(tmp, disp_hdr, markWord::unlocked_value);
// Initialize the box. (Must happen before we update the object mark!)
str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
// Compare object markWord with an unlocked value (tmp) and if
// equal exchange the stack address of our box with object markWord.
// On failure disp_hdr contains the possibly locked markWord.
cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
/*release*/ true, /*weak*/ false, disp_hdr);
br(Assembler::EQ, cont);
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
// If the compare-and-exchange succeeded, then we found an unlocked
// object, have now locked it, and will continue at label cont.
// Check if the owner is self by comparing the value in the
// markWord of object (disp_hdr) with the stack pointer.
mov(rscratch1, sp);
sub(disp_hdr, disp_hdr, rscratch1);
mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
// If the condition is true we are done (continue at cont) and hence we can store 0
// as the displaced header in the box, which indicates that it is a recursive lock.
ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
b(cont);
} else {
assert(LockingMode == LM_LIGHTWEIGHT, "must be");
lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count);
b(count);
}
// Handle existing monitor.
bind(object_has_monitor);
// The object's monitor m is unlocked iff m->owner == NULL,
// otherwise m->owner may contain a thread or a stack address.
//
// Try to CAS m->owner from NULL to current thread.
add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
/*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result
if (LockingMode != LM_LIGHTWEIGHT) {
// Store a non-null value into the box to avoid looking like a re-entrant
// lock. The fast-path monitor unlock code checks for
// markWord::monitor_value so use markWord::unused_mark which has the
// relevant bit set, and also matches ObjectSynchronizer::enter.
mov(tmp, (address)markWord::unused_mark().value());
str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
}
br(Assembler::EQ, cont); // CAS success means locking succeeded
cmp(rscratch1, rthread);
br(Assembler::NE, cont); // Check for recursive locking
// Recursive lock case
increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
// flag == EQ still from the cmp above, checking if this is a reentrant lock
bind(cont);
// flag == EQ indicates success
// flag == NE indicates failure
br(Assembler::NE, no_count);
bind(count);
increment(Address(rthread, JavaThread::held_monitor_count_offset()));
bind(no_count);
}
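// A rough C-like sketch of the fast-lock path emitted above (illustrative only;
// the names below are descriptive, not actual HotSpot identifiers):
//
//   markWord mark = obj->mark();
//   if (mark has the monitor bit set) {              // inflated lock
//     succeed if CAS(&monitor->owner, NULL, current_thread) succeeds,
//     or if we already own it (then monitor->recursions++); otherwise fail.
//   } else if (LockingMode == LM_LEGACY) {
//     box->displaced_header = mark | unlocked_value;
//     if (CAS(&obj->mark, mark | unlocked_value, box)) succeed;
//     else if (the old mark points into the current thread's stack) {
//       box->displaced_header = 0;                   // recursive stack-lock
//       succeed;
//     } else fail;
//   }
//   // On success the flags are EQ and the held-monitor count is incremented;
//   // on failure the flags are NE and the slow path is taken.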
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
Register tmp2Reg) {
Register oop = objectReg;
Register box = boxReg;
Register disp_hdr = tmpReg;
Register tmp = tmp2Reg;
Label cont;
Label object_has_monitor;
Label count, no_count;
assert_different_registers(oop, box, tmp, disp_hdr);
if (LockingMode == LM_LEGACY) {
// Find the lock address and load the displaced header from the stack.
ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
// If the displaced header is 0, we have a recursive unlock.
cmp(disp_hdr, zr);
br(Assembler::EQ, cont);
}
// Handle existing monitor.
ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
if (LockingMode == LM_MONITOR) {
tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
b(cont);
} else if (LockingMode == LM_LEGACY) {
// Check if it is still a lightweight lock; this is true if we
// see the stack address of the basicLock in the markWord of the
// object.
cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
/*release*/ true, /*weak*/ false, tmp);
b(cont);
} else {
assert(LockingMode == LM_LIGHTWEIGHT, "must be");
lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
b(count);
}
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
// Handle existing monitor.
bind(object_has_monitor);
STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
add(tmp, tmp, -(int)markWord::monitor_value); // monitor
if (LockingMode == LM_LIGHTWEIGHT) {
// If the owner is anonymous, we need to fix it -- in an outline stub.
Register tmp2 = disp_hdr;
ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
// We cannot use tbnz here, the target might be too far away and cannot
// be encoded.
tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
Compile::current()->output()->add_stub(stub);
br(Assembler::NE, stub->entry());
bind(stub->continuation());
}
ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
Label notRecursive;
cbz(disp_hdr, notRecursive);
// Recursive lock
sub(disp_hdr, disp_hdr, 1u);
str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
cmp(disp_hdr, disp_hdr); // Sets flags for result
b(cont);
bind(notRecursive);
ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
cmp(rscratch1, zr); // Sets flags for result
cbnz(rscratch1, cont);
// need a release store here
lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
stlr(zr, tmp); // set unowned
bind(cont);
// flag == EQ indicates success
// flag == NE indicates failure
br(Assembler::NE, no_count);
bind(count);
decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
bind(no_count);
}
// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
Register cnt2, Register cnt1,
Register tmp1, Register tmp2,
Register tmp3, Register tmp4,
Register tmp5, Register tmp6,
int icnt1, Register result, int ae) {
// NOTE: tmp5, tmp6 can be zr depending on specific method version
Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
Register ch1 = rscratch1;
Register ch2 = rscratch2;
Register cnt1tmp = tmp1;
Register cnt2tmp = tmp2;
Register cnt1_neg = cnt1;
Register cnt2_neg = cnt2;
Register result_tmp = tmp4;
bool isL = ae == StrIntrinsicNode::LL;
bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
int str1_chr_shift = str1_isL ? 0:1;
int str2_chr_shift = str2_isL ? 0:1;
int str1_chr_size = str1_isL ? 1:2;
int str2_chr_size = str2_isL ? 1:2;
chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
// Note, inline_string_indexOf() generates checks:
// if (substr.count > string.count) return -1;
// if (substr.count == 0) return 0;
// We have two strings, a source string in str2, cnt2 and a pattern string
// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
// For a larger pattern and source we use a simplified Boyer-Moore algorithm.
// With a small pattern and source we use a linear scan.
if (icnt1 == -1) {
sub(result_tmp, cnt2, cnt1);
cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
br(LT, LINEARSEARCH);
dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
subs(zr, cnt1, 256);
lsr(tmp1, cnt2, 2);
ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
br(GE, LINEARSTUB);
}
// The Boyer-Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
// int bm(unsigned char *x, int m, unsigned char *y, int n) {
// int i, j;
// unsigned c;
// unsigned char bc[ASIZE];
//
// /* Preprocessing */
// for (i = 0; i < ASIZE; ++i)
// bc[i] = m;
// for (i = 0; i < m - 1; ) {
// c = x[i];
// ++i;
// // c < 256 for Latin1 string, so, no need for branch
// #ifdef PATTERN_STRING_IS_LATIN1
// bc[c] = m - i;
// #else
// if (c < ASIZE) bc[c] = m - i;
// #endif
// }
//
// /* Searching */
// j = 0;
// while (j <= n - m) {
// c = y[j+m-1];
// if (x[m-1] == c)
// for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
// if (i < 0) return j;
// // c < 256 for Latin1 string, so, no need for branch
// #ifdef SOURCE_STRING_IS_LATIN1
// // LL case: (c< 256) always true. Remove branch
// j += bc[y[j+m-1]];
// #endif
// #ifndef PATTERN_STRING_IS_UTF
// // UU case: need if (c<ASIZE) check. Skip 1 character if not.
// if (c < ASIZE)
// j += bc[y[j+m-1]];
// else
// j += 1
// #endif
// #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
// // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
// if (c < ASIZE)
// j += bc[y[j+m-1]];
// else
// j += m
// #endif
// }
// }
if (icnt1 == -1) {
Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
Register cnt1end = tmp2;
Register str2end = cnt2;
Register skipch = tmp2;
// str1 length is >= 8, so we can read at least 1 register for cases when
// UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a register
// for the UL case. We'll re-read the last character in the inner pre-loop code to
// keep a single outer pre-loop load.
const int firstStep = isL ? 7 : 3;
const int ASIZE = 256;
const int STORED_BYTES = 32; // amount of bytes stored per instruction
sub(sp, sp, ASIZE);
mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
mov(ch1, sp);
BIND(BM_INIT_LOOP);
stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
subs(tmp5, tmp5, 1);
br(GT, BM_INIT_LOOP);
sub(cnt1tmp, cnt1, 1);
mov(tmp5, str2);
add(str2end, str2, result_tmp, LSL, str2_chr_shift);
sub(ch2, cnt1, 1);
mov(tmp3, str1);
BIND(BCLOOP);
(this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
if (!str1_isL) {
subs(zr, ch1, ASIZE);
br(HS, BCSKIP);
}
strb(ch2, Address(sp, ch1));
BIND(BCSKIP);
subs(ch2, ch2, 1);
br(GT, BCLOOP);
add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
if (str1_isL == str2_isL) {
// load last 8 bytes (8LL/4UU symbols)
ldr(tmp6, Address(tmp6, -wordSize));
} else {
ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
// convert Latin1 to UTF. We'll have to wait until load completed, but
// it's still faster than per-character loads+checks
lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
ubfx(ch1, tmp6, 8, 8); // str1[N-2]
ubfx(ch2, tmp6, 16, 8); // str1[N-3]
andr(tmp6, tmp6, 0xFF); // str1[N-4]
orr(ch2, ch1, ch2, LSL, 16);
orr(tmp6, tmp6, tmp3, LSL, 48);
orr(tmp6, tmp6, ch2, LSL, 16);
}
BIND(BMLOOPSTR2);
(this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
if (str1_isL == str2_isL) {
// re-init tmp3. It's for free because it's executed in parallel with
// load above. Alternative is to initialize it before loop, but it'll
// affect performance on in-order systems with 2 or more ld/st pipelines
lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
}
if (!isL) { // UU/UL case
lsl(ch2, cnt1tmp, 1); // offset in bytes
}
cmp(tmp3, skipch);
br(NE, BMSKIP);
ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
mov(ch1, tmp6);
if (isL) {
b(BMLOOPSTR1_AFTER_LOAD);
} else {
sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
b(BMLOOPSTR1_CMP);
}
BIND(BMLOOPSTR1);
(this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
(this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
BIND(BMLOOPSTR1_AFTER_LOAD);
subs(cnt1tmp, cnt1tmp, 1);
br(LT, BMLOOPSTR1_LASTCMP);
BIND(BMLOOPSTR1_CMP);
cmp(ch1, ch2);
br(EQ, BMLOOPSTR1);
BIND(BMSKIP);
if (!isL) {
// if we've met a UTF symbol while searching with a Latin1 pattern, then we can
// skip cnt1 symbols
if (str1_isL != str2_isL) {
mov(result_tmp, cnt1);
} else {
mov(result_tmp, 1);
}
subs(zr, skipch, ASIZE);
br(HS, BMADV);
}
ldrb(result_tmp, Address(sp, skipch)); // load skip distance
BIND(BMADV);
sub(cnt1tmp, cnt1, 1);
add(str2, str2, result_tmp, LSL, str2_chr_shift);
cmp(str2, str2end);
br(LE, BMLOOPSTR2);
add(sp, sp, ASIZE);
b(NOMATCH);
BIND(BMLOOPSTR1_LASTCMP);
cmp(ch1, ch2);
br(NE, BMSKIP);
BIND(BMMATCH);
sub(result, str2, tmp5);
if (!str2_isL) lsr(result, result, 1);
add(sp, sp, ASIZE);
b(DONE);
BIND(LINEARSTUB);
cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
br(LT, LINEAR_MEDIUM);
mov(result, zr);
RuntimeAddress stub = nullptr;
if (isL) {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
} else if (str1_isL) {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
} else {
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
}
address call = trampoline_call(stub);
if (call == nullptr) {
DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
ciEnv::current()->record_failure("CodeCache is full");
return;
}
b(DONE);
}
BIND(LINEARSEARCH);
{
Label DO1, DO2, DO3;
Register str2tmp = tmp2;
Register first = tmp3;
if (icnt1 == -1)
{
Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
br(LT, DOSHORT);
BIND(LINEAR_MEDIUM);
(this->*str1_load_1chr)(first, Address(str1));
lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(FIRST_LOOP);
(this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
cmp(first, ch2);
br(EQ, STR1_LOOP);
BIND(STR2_NEXT);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, FIRST_LOOP);
b(NOMATCH);
BIND(STR1_LOOP);
adds(cnt1tmp, cnt1_neg, str1_chr_size);
add(cnt2tmp, cnt2_neg, str2_chr_size);
br(GE, MATCH);
BIND(STR1_NEXT);
(this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
(this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
cmp(ch1, ch2);
br(NE, STR2_NEXT);
adds(cnt1tmp, cnt1tmp, str1_chr_size);
add(cnt2tmp, cnt2tmp, str2_chr_size);
br(LT, STR1_NEXT);
b(MATCH);
BIND(DOSHORT);
if (str1_isL == str2_isL) {
cmp(cnt1, (u1)2);
br(LT, DO1);
br(GT, DO3);
}
}
if (icnt1 == 4) {
Label CH1_LOOP;
(this->*load_4chr)(ch1, str1);
sub(result_tmp, cnt2, 4);
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(CH1_LOOP);
(this->*load_4chr)(ch2, Address(str2, cnt2_neg));
cmp(ch1, ch2);
br(EQ, MATCH);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, CH1_LOOP);
b(NOMATCH);
}
if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
Label CH1_LOOP;
BIND(DO2);
(this->*load_2chr)(ch1, str1);
if (icnt1 == 2) {
sub(result_tmp, cnt2, 2);
}
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(CH1_LOOP);
(this->*load_2chr)(ch2, Address(str2, cnt2_neg));
cmp(ch1, ch2);
br(EQ, MATCH);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, CH1_LOOP);
b(NOMATCH);
}
if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
BIND(DO3);
(this->*load_2chr)(first, str1);
(this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
if (icnt1 == 3) {
sub(result_tmp, cnt2, 3);
}
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
BIND(FIRST_LOOP);
(this->*load_2chr)(ch2, Address(str2, cnt2_neg));
cmpw(first, ch2);
br(EQ, STR1_LOOP);
BIND(STR2_NEXT);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LE, FIRST_LOOP);
b(NOMATCH);
BIND(STR1_LOOP);
add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
(this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
cmp(ch1, ch2);
br(NE, STR2_NEXT);
b(MATCH);
}
if (icnt1 == -1 || icnt1 == 1) {
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
BIND(DO1);
(this->*str1_load_1chr)(ch1, str1);
cmp(cnt2, (u1)8);
br(LT, DO1_SHORT);
sub(result_tmp, cnt2, 8/str2_chr_size);
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
if (str2_isL) {
orr(ch1, ch1, ch1, LSL, 8);
}
orr(ch1, ch1, ch1, LSL, 16);
orr(ch1, ch1, ch1, LSL, 32);
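// The loop below applies the classic SWAR zero-lane test to ch2 = data ^ pattern:
// (ch2 - 0x01..01) & ~ch2 & 0x80..80 (computed here via sub/orr/bics) is non-zero
// iff some byte (or 16-bit char) lane of ch2 is zero, i.e. iff the first pattern
// character occurs somewhere in this longword.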
BIND(CH1_LOOP);
ldr(ch2, Address(str2, cnt2_neg));
eor(ch2, ch1, ch2);
sub(tmp1, ch2, tmp3);
orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt2_neg, cnt2_neg, 8);
br(LT, CH1_LOOP);
cmp(cnt2_neg, (u1)8);
mov(cnt2_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
rev(tmp1, tmp1);
clz(tmp1, tmp1);
add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
b(MATCH);
BIND(DO1_SHORT);
mov(result_tmp, cnt2);
lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
BIND(DO1_LOOP);
(this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
cmpw(ch1, ch2);
br(EQ, MATCH);
adds(cnt2_neg, cnt2_neg, str2_chr_size);
br(LT, DO1_LOOP);
}
}
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
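// cnt2_neg holds the (negative) byte offset of the match from the adjusted end of
// str2; shift it back to characters and add result_tmp to produce the match index.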
add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
BIND(DONE);
}
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
Register ch, Register result,
Register tmp1, Register tmp2, Register tmp3)
{
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
Register cnt1_neg = cnt1;
Register ch1 = rscratch1;
Register result_tmp = rscratch2;
cbz(cnt1, NOMATCH);
cmp(cnt1, (u1)4);
br(LT, DO1_SHORT);
orr(ch, ch, ch, LSL, 16);
orr(ch, ch, ch, LSL, 32);
sub(cnt1, cnt1, 4);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1, Address::uxtw(1)));
sub(cnt1_neg, zr, cnt1, LSL, 1);
mov(tmp3, 0x0001000100010001);
BIND(CH1_LOOP);
ldr(ch1, Address(str1, cnt1_neg));
eor(ch1, ch, ch1);
sub(tmp1, ch1, tmp3);
orr(tmp2, ch1, 0x7fff7fff7fff7fff);
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt1_neg, cnt1_neg, 8);
br(LT, CH1_LOOP);
cmp(cnt1_neg, (u1)8);
mov(cnt1_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
rev(tmp1, tmp1);
clz(tmp1, tmp1);
add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
b(MATCH);
BIND(DO1_SHORT);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1, Address::uxtw(1)));
sub(cnt1_neg, zr, cnt1, LSL, 1);
BIND(DO1_LOOP);
ldrh(ch1, Address(str1, cnt1_neg));
cmpw(ch, ch1);
br(EQ, MATCH);
adds(cnt1_neg, cnt1_neg, 2);
br(LT, DO1_LOOP);
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
add(result, result_tmp, cnt1_neg, ASR, 1);
BIND(DONE);
}
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
Register ch, Register result,
FloatRegister ztmp1,
FloatRegister ztmp2,
PRegister tmp_pg,
PRegister tmp_pdn, bool isL)
{
// Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
assert(tmp_pg->is_governing(),
"this register has to be a governing predicate register");
Label LOOP, MATCH, DONE, NOMATCH;
Register vec_len = rscratch1;
Register idx = rscratch2;
SIMD_RegVariant T = (isL == true) ? B : H;
cbz(cnt1, NOMATCH);
// Assign the particular char throughout the vector.
sve_dup(ztmp2, T, ch);
if (isL) {
sve_cntb(vec_len);
} else {
sve_cnth(vec_len);
}
mov(idx, 0);
// Generate a predicate to control the reading of input string.
sve_whilelt(tmp_pg, T, idx, cnt1);
BIND(LOOP);
// Read a vector of 8- or 16-bit data depending on the string type. Note
// that inactive elements indicated by the predicate register won't cause
// a data read from memory to the destination vector.
if (isL) {
sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
} else {
sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
}
add(idx, idx, vec_len);
// Perform the comparison. An element of the destination predicate is set
// to active if the particular char is matched.
sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
// Branch if the particular char is found.
br(NE, MATCH);
sve_whilelt(tmp_pg, T, idx, cnt1);
// Loop back if the particular char is not found.
br(MI, LOOP);
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
// Undo the index increment.
sub(idx, idx, vec_len);
// Crop the vector to find its location.
sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
add(result, idx, -1);
sve_incp(result, T, tmp_pdn);
BIND(DONE);
}
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
Register ch, Register result,
Register tmp1, Register tmp2, Register tmp3)
{
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
Register cnt1_neg = cnt1;
Register ch1 = rscratch1;
Register result_tmp = rscratch2;
cbz(cnt1, NOMATCH);
cmp(cnt1, (u1)8);
br(LT, DO1_SHORT);
orr(ch, ch, ch, LSL, 8);
orr(ch, ch, ch, LSL, 16);
orr(ch, ch, ch, LSL, 32);
sub(cnt1, cnt1, 8);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1));
sub(cnt1_neg, zr, cnt1);
mov(tmp3, 0x0101010101010101);
BIND(CH1_LOOP);
ldr(ch1, Address(str1, cnt1_neg));
eor(ch1, ch, ch1);
sub(tmp1, ch1, tmp3);
orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
bics(tmp1, tmp1, tmp2);
br(NE, HAS_ZERO);
adds(cnt1_neg, cnt1_neg, 8);
br(LT, CH1_LOOP);
cmp(cnt1_neg, (u1)8);
mov(cnt1_neg, 0);
br(LT, CH1_LOOP);
b(NOMATCH);
BIND(HAS_ZERO);
rev(tmp1, tmp1);
clz(tmp1, tmp1);
add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
b(MATCH);
BIND(DO1_SHORT);
mov(result_tmp, cnt1);
lea(str1, Address(str1, cnt1));
sub(cnt1_neg, zr, cnt1);
BIND(DO1_LOOP);
ldrb(ch1, Address(str1, cnt1_neg));
cmp(ch, ch1);
br(EQ, MATCH);
adds(cnt1_neg, cnt1_neg, 1);
br(LT, DO1_LOOP);
BIND(NOMATCH);
mov(result, -1);
b(DONE);
BIND(MATCH);
add(result, result_tmp, cnt1_neg);
BIND(DONE);
}
// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
PRegister pgtmp1, PRegister pgtmp2, int ae) {
Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
SHORT_LOOP_START, TAIL_CHECK;
bool isLL = ae == StrIntrinsicNode::LL;
bool isLU = ae == StrIntrinsicNode::LU;
bool isUL = ae == StrIntrinsicNode::UL;
// The stub threshold for LL strings is: 72 (64 + 8) chars
// UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
// LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
bool str1_isL = isLL || isLU;
bool str2_isL = isLL || isUL;
int str1_chr_shift = str1_isL ? 0 : 1;
int str2_chr_shift = str2_isL ? 0 : 1;
int str1_chr_size = str1_isL ? 1 : 2;
int str2_chr_size = str2_isL ? 1 : 2;
int minCharsInWord = isLL ? wordSize : wordSize/2;
FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
(chr_insn)&MacroAssembler::ldrh;
uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
(uxt_insn)&MacroAssembler::uxthw;
BLOCK_COMMENT("string_compare {");
// Bizarrely, the counts are passed in bytes, regardless of whether they
// are L or U strings; however, the result is always in characters.
if (!str1_isL) asrw(cnt1, cnt1, 1);
if (!str2_isL) asrw(cnt2, cnt2, 1);
// Compute the minimum of the string lengths and save the difference.
subsw(result, cnt1, cnt2);
cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
// A very short string
cmpw(cnt2, minCharsInWord);
br(Assembler::LE, SHORT_STRING);
// Compare longwords
// load first parts of strings and finish initialization while loading
{
if (str1_isL == str2_isL) { // LL or UU
ldr(tmp1, Address(str1));
cmp(str1, str2);
br(Assembler::EQ, DONE);
ldr(tmp2, Address(str2));
cmp(cnt2, stub_threshold);
br(GE, STUB);
subsw(cnt2, cnt2, minCharsInWord);
br(EQ, TAIL_CHECK);
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
} else if (isLU) {
ldrs(vtmp, Address(str1));
ldr(tmp2, Address(str2));
cmp(cnt2, stub_threshold);
br(GE, STUB);
subw(cnt2, cnt2, 4);
eor(vtmpZ, T16B, vtmpZ, vtmpZ);
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
zip1(vtmp, T8B, vtmp, vtmpZ);
sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
add(cnt1, cnt1, 4);
fmovd(tmp1, vtmp);
} else { // UL case
ldr(tmp1, Address(str1));
ldrs(vtmp, Address(str2));
cmp(cnt2, stub_threshold);
br(GE, STUB);
subw(cnt2, cnt2, 4);
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
eor(vtmpZ, T16B, vtmpZ, vtmpZ);
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
zip1(vtmp, T8B, vtmp, vtmpZ);
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
add(cnt1, cnt1, 8);
fmovd(tmp2, vtmp);
}
adds(cnt2, cnt2, isUL ? 4 : 8);
br(GE, TAIL);
eor(rscratch2, tmp1, tmp2);
cbnz(rscratch2, DIFF);
// main loop
bind(NEXT_WORD);
if (str1_isL == str2_isL) {
ldr(tmp1, Address(str1, cnt2));
ldr(tmp2, Address(str2, cnt2));
adds(cnt2, cnt2, 8);
} else if (isLU) {
ldrs(vtmp, Address(str1, cnt1));
ldr(tmp2, Address(str2, cnt2));
add(cnt1, cnt1, 4);
zip1(vtmp, T8B, vtmp, vtmpZ);
fmovd(tmp1, vtmp);
adds(cnt2, cnt2, 8);
} else { // UL
ldrs(vtmp, Address(str2, cnt2));
ldr(tmp1, Address(str1, cnt1));
zip1(vtmp, T8B, vtmp, vtmpZ);
add(cnt1, cnt1, 8);
fmovd(tmp2, vtmp);
adds(cnt2, cnt2, 4);
}
br(GE, TAIL);
eor(rscratch2, tmp1, tmp2);
cbz(rscratch2, NEXT_WORD);
b(DIFF);
bind(TAIL);
eor(rscratch2, tmp1, tmp2);
cbnz(rscratch2, DIFF);
// Last longword. In the case where length == 4 we compare the
// same longword twice, but that's still faster than another
// conditional branch.
if (str1_isL == str2_isL) {
ldr(tmp1, Address(str1));
ldr(tmp2, Address(str2));
} else if (isLU) {
ldrs(vtmp, Address(str1));
ldr(tmp2, Address(str2));
zip1(vtmp, T8B, vtmp, vtmpZ);
fmovd(tmp1, vtmp);
} else { // UL
ldrs(vtmp, Address(str2));
ldr(tmp1, Address(str1));
zip1(vtmp, T8B, vtmp, vtmpZ);
fmovd(tmp2, vtmp);
}
bind(TAIL_CHECK);
eor(rscratch2, tmp1, tmp2);
cbz(rscratch2, DONE);
// Find the first different characters in the longwords and
// compute their difference.
bind(DIFF);
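// rscratch2 = tmp1 ^ tmp2. Byte-reversing it and counting leading zeros gives the
// bit offset of the lowest-addressed differing byte; rounding down to a character
// boundary (8 bits for Latin1, 16 for UTF-16) and shifting both longwords right by
// that amount brings the first differing characters into the low bits for extraction.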
rev(rscratch2, rscratch2);
clz(rscratch2, rscratch2);
andr(rscratch2, rscratch2, isLL ? -8 : -16);
lsrv(tmp1, tmp1, rscratch2);
(this->*ext_chr)(tmp1, tmp1);
lsrv(tmp2, tmp2, rscratch2);
(this->*ext_chr)(tmp2, tmp2);
subw(result, tmp1, tmp2);
b(DONE);
}
bind(STUB);
RuntimeAddress stub = nullptr;
switch(ae) {
case StrIntrinsicNode::LL:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
break;
case StrIntrinsicNode::UU:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
break;
case StrIntrinsicNode::LU:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
break;
case StrIntrinsicNode::UL:
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
break;
default:
ShouldNotReachHere();
}
assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
address call = trampoline_call(stub);
if (call == nullptr) {
DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
ciEnv::current()->record_failure("CodeCache is full");
return;
}
b(DONE);
bind(SHORT_STRING);
// Is the minimum length zero?
cbz(cnt2, DONE);
// Arrange the code to do most branches while loading, and to load the next
// characters while comparing the previous ones.
(this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
subs(cnt2, cnt2, 1);
br(EQ, SHORT_LAST_INIT);
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
b(SHORT_LOOP_START);
bind(SHORT_LOOP);
subs(cnt2, cnt2, 1);
br(EQ, SHORT_LAST);
bind(SHORT_LOOP_START);
(this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
(this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
cmp(tmp1, cnt1);
br(NE, SHORT_LOOP_TAIL);
subs(cnt2, cnt2, 1);
br(EQ, SHORT_LAST2);
(this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
cmp(tmp2, rscratch1);
br(EQ, SHORT_LOOP);
sub(result, tmp2, rscratch1);
b(DONE);
bind(SHORT_LOOP_TAIL);
sub(result, tmp1, cnt1);
b(DONE);
bind(SHORT_LAST2);
cmp(tmp2, rscratch1);
br(EQ, DONE);
sub(result, tmp2, rscratch1);
b(DONE);
bind(SHORT_LAST_INIT);
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
bind(SHORT_LAST);
cmp(tmp1, cnt1);
br(EQ, DONE);
sub(result, tmp1, cnt1);
bind(DONE);
BLOCK_COMMENT("} string_compare");
}
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
FloatRegister src2, Condition cond, bool isQ) {
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
FloatRegister zn = src1, zm = src2;
bool needs_negation = false;
switch (cond) {
case LT: cond = GT; zn = src2; zm = src1; break;
case LE: cond = GE; zn = src2; zm = src1; break;
case LO: cond = HI; zn = src2; zm = src1; break;
case LS: cond = HS; zn = src2; zm = src1; break;
case NE: cond = EQ; needs_negation = true; break;
default:
break;
}
if (is_floating_point_type(bt)) {
fcm(cond, dst, size, zn, zm);
} else {
cm(cond, dst, size, zn, zm);
}
if (needs_negation) {
notr(dst, isQ ? T16B : T8B, dst);
}
}
void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
Condition cond, bool isQ) {
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
if (bt == T_FLOAT || bt == T_DOUBLE) {
if (cond == Assembler::NE) {
fcm(Assembler::EQ, dst, size, src);
notr(dst, isQ ? T16B : T8B, dst);
} else {
fcm(cond, dst, size, src);
}
} else {
if (cond == Assembler::NE) {
cm(Assembler::EQ, dst, size, src);
notr(dst, isQ ? T16B : T8B, dst);
} else {
cm(cond, dst, size, src);
}
}
}
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
// Example input, dst = 0x01 00 00 00 01 01 00 01
// The "??" bytes are garbage.
orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
andr(dst, dst, 0xff); // dst = 0x8D
}
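// A scalar C sketch of the transformation above, assuming every input byte is
// 0x00 or 0x01 (illustrative only, not part of the generated code):
//
//   uint64_t bytemask_compress(uint64_t x) {
//     x |= x >> 7;                  // pair up neighbouring byte LSBs
//     x |= x >> 14;                 // gather groups of four
//     x |= x >> 28;                 // gather all eight bits into the low byte
//     return x & 0xff;              // drop the garbage in the higher bits
//   }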
// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
assert_different_registers(dst, rscratch1);
assert_different_registers(vtmp1, vtmp2);
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
// Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
// Expected: dst = 0x658D
// Convert the mask into vector with sequential bytes.
// vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
sve_cpy(vtmp1, size, src, 1, false);
if (bt != T_BYTE) {
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
}
if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
// Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
// is to compress the significant bit of each byte in a cross-lane way. Due
// to the lack of a cross-lane bit-compress instruction, we use BEXT
// (bit-compress in each lane) with the biggest lane size (T = D) then
// concatenate the results.
// The second source input of BEXT, initialized with 0x01 in each byte.
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
sve_dup(vtmp2, B, 1);
// BEXT vtmp1.D, vtmp1.D, vtmp2.D
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
// ---------------------------------------
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
sve_bext(vtmp1, D, vtmp1, vtmp2);
// Concatenate the least significant 8 bits of each 8-byte group, and extract the
// result to dst.
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
// dst = 0x658D
if (lane_cnt <= 8) {
// No need to concatenate.
umov(dst, vtmp1, B, 0);
} else if (lane_cnt <= 16) {
ins(vtmp1, B, vtmp1, 1, 8);
umov(dst, vtmp1, H, 0);
} else {
// As the lane count is 64 at most, the final expected value must be in
// the lowest 64 bits after narrowing vtmp1 from D to B.
sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
umov(dst, vtmp1, D, 0);
}
} else if (UseSVE > 0) {
// Compress the lowest 8 bytes.
fmovd(dst, vtmp1);
bytemask_compress(dst);
if (lane_cnt <= 8) return;
// Repeat on higher bytes and join the results.
// Compress 8 bytes in each iteration.
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
bytemask_compress(rscratch1);
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
}
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
// Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
// Expected: dst = 0b01100101 10001101
// Put long value from general purpose register into the first lane of vector.
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
sve_dup(vtmp1, B, 0);
mov(vtmp1, D, 0, src);
// As sve_cmp generates the mask value with a minimum granularity of one byte, we
// must transform the value in the first lane, which is currently a mask in bits,
// into a mask in bytes; this can be done with SVE2's BDEP instruction.
// The first source input of the BDEP instruction: deposit one mask byte into every 8 bytes.
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
if (lane_cnt <= 8) {
// Nothing to do, as only one byte exists.
} else if (lane_cnt <= 16) {
ins(vtmp1, B, vtmp1, 8, 1);
mov(vtmp1, B, 1, zr);
} else {
sve_vector_extend(vtmp1, D, vtmp1, B);
}
// The second source input of BDEP instruction, initialized with 0x01 for each byte.
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
sve_dup(vtmp2, B, 1);
// BDEP vtmp1.D, vtmp1.D, vtmp2.D
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
// ---------------------------------------
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
sve_bdep(vtmp1, D, vtmp1, vtmp2);
if (bt != T_BYTE) {
sve_vector_extend(vtmp1, size, vtmp1, B);
}
// Generate mask according to the given vector, in which the elements have been
// extended to expected type.
// dst = 0b01100101 10001101
sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}
// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
FloatRegister zn, FloatRegister zm, Condition cond) {
assert(pg->is_governing(), "This register has to be a governing predicate register");
FloatRegister z1 = zn, z2 = zm;
switch (cond) {
case LE: z1 = zm; z2 = zn; cond = GE; break;
case LT: z1 = zm; z2 = zn; cond = GT; break;
case LO: z1 = zm; z2 = zn; cond = HI; break;
case LS: z1 = zm; z2 = zn; cond = HS; break;
default:
break;
}
SIMD_RegVariant size = elemType_to_regVariant(bt);
if (is_floating_point_type(bt)) {
sve_fcm(cond, pd, size, pg, z1, z2);
} else {
assert(is_integral_type(bt), "unsupported element type");
sve_cmp(cond, pd, size, pg, z1, z2);
}
}
// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
SIMD_RegVariant size = elemType_to_regVariant(bt);
sve_rev(ptmp, size, src);
sve_brkb(ptmp, ptrue, ptmp, false);
sve_cntp(dst, size, ptrue, ptmp);
movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
subw(dst, rscratch1, dst);
}
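// A scalar sketch of the computation above (illustrative only): reverse the mask,
// break before the first active lane, and count the active lanes of the result,
// which equals the number of trailing inactive lanes of the original mask.
//
//   int vmask_lasttrue(const bool mask[], int lanes) {
//     for (int i = lanes - 1; i >= 0; i--) {
//       if (mask[i]) return i;
//     }
//     return -1;                    // an all-false mask also yields -1 above
//   }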
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
FloatRegister src, BasicType src_bt) {
if (src_bt == T_BYTE) {
if (dst_bt == T_SHORT) {
// 4B/8B to 4S/8S
assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
sxtl(dst, T8H, src, T8B);
} else {
// 4B to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T8H, src, T8B);
sxtl(dst, T4S, dst, T4H);
}
} else if (src_bt == T_SHORT) {
// 4S to 4I
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
sxtl(dst, T4S, src, T4H);
} else if (src_bt == T_INT) {
// 2I to 2L
assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
sxtl(dst, T2D, src, T2S);
} else {
ShouldNotReachHere();
}
}
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
if (src_bt == T_SHORT) {
// 4S/8S to 4B/8B
assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_BYTE, "unsupported");
xtn(dst, T8B, src, T8H);
} else if (src_bt == T_INT) {
// 4I to 4B/4S
assert(src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
xtn(dst, T4H, src, T4S);
if (dst_bt == T_BYTE) {
xtn(dst, T8B, dst, T8H);
}
} else if (src_bt == T_LONG) {
// 2L to 2I
assert(src_vlen_in_bytes == 16, "unsupported");
assert(dst_bt == T_INT, "unsupported");
xtn(dst, T2S, src, T2D);
} else {
ShouldNotReachHere();
}
}
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size) {
assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
if (src_size == B) {
switch (dst_size) {
case H:
sve_sunpklo(dst, H, src);
break;
case S:
sve_sunpklo(dst, H, src);
sve_sunpklo(dst, S, dst);
break;
case D:
sve_sunpklo(dst, H, src);
sve_sunpklo(dst, S, dst);
sve_sunpklo(dst, D, dst);
break;
default:
ShouldNotReachHere();
}
} else if (src_size == H) {
if (dst_size == S) {
sve_sunpklo(dst, S, src);
} else { // D
sve_sunpklo(dst, S, src);
sve_sunpklo(dst, D, dst);
}
} else if (src_size == S) {
sve_sunpklo(dst, D, src);
}
}
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
FloatRegister src, SIMD_RegVariant src_size,
FloatRegister tmp) {
assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
assert_different_registers(src, tmp);
sve_dup(tmp, src_size, 0);
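// uzp1 keeps the even-numbered elements of the concatenation (src, tmp); with tmp
// zeroed this extracts the low half of each src element into the low half of dst
// and fills the upper part of dst with zero.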
if (src_size == D) {
switch (dst_size) {
case S:
sve_uzp1(dst, S, src, tmp);
break;
case H:
assert_different_registers(dst, tmp);
sve_uzp1(dst, S, src, tmp);
sve_uzp1(dst, H, dst, tmp);
break;
case B:
assert_different_registers(dst, tmp);
sve_uzp1(dst, S, src, tmp);
sve_uzp1(dst, H, dst, tmp);
sve_uzp1(dst, B, dst, tmp);
break;
default:
ShouldNotReachHere();
}
} else if (src_size == S) {
if (dst_size == H) {
sve_uzp1(dst, H, src, tmp);
} else { // B
assert_different_registers(dst, tmp);
sve_uzp1(dst, H, src, tmp);
sve_uzp1(dst, B, dst, tmp);
}
} else if (src_size == H) {
sve_uzp1(dst, B, src, tmp);
}
}
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
uint dst_element_length_in_bytes,
uint src_element_length_in_bytes) {
if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
sve_punpklo(dst, src);
} else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
sve_punpklo(dst, src);
sve_punpklo(dst, dst);
} else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
sve_punpklo(dst, src);
sve_punpklo(dst, dst);
sve_punpklo(dst, dst);
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
// The insignificant bits in src predicate are expected to be zero.
// To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
// passed as the second argument. An example narrowing operation with a given mask:
// 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
// Mask (for 2 Longs): TF
// Predicate register for the above mask (16 bits): 00000001 00000000
// After narrowing (uzp1 dst.b, src.b, ptmp.b): 0000 0000 0001 0000
// which translates to a mask for 2 integers of: TF (the lower half is considered while the upper half is 0)
assert_different_registers(src, ptmp);
assert_different_registers(dst, ptmp);
sve_pfalse(ptmp);
if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
sve_uzp1(dst, B, src, ptmp);
} else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
sve_uzp1(dst, H, src, ptmp);
sve_uzp1(dst, B, dst, ptmp);
} else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
sve_uzp1(dst, S, src, ptmp);
sve_uzp1(dst, H, dst, ptmp);
sve_uzp1(dst, B, dst, ptmp);
} else {
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_add_integral {");
switch(bt) {
case T_BYTE:
addv(vtmp, isQ ? T16B : T8B, vsrc);
smov(dst, vtmp, B, 0);
addw(dst, dst, isrc, ext::sxtb);
break;
case T_SHORT:
addv(vtmp, isQ ? T8H : T4H, vsrc);
smov(dst, vtmp, H, 0);
addw(dst, dst, isrc, ext::sxth);
break;
case T_INT:
isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
umov(dst, vtmp, S, 0);
addw(dst, dst, isrc);
break;
case T_LONG:
assert(isQ, "unsupported");
addpd(vtmp, vsrc);
umov(dst, vtmp, D, 0);
add(dst, dst, isrc);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_add_integral");
}
// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp1, FloatRegister vtmp2) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_mul_integral {");
switch(bt) {
case T_BYTE:
if (isQ) {
// Multiply the lower half and higher half of vector iteratively.
// vtmp1 = vsrc[8:15]
ins(vtmp1, D, vsrc, 0, 1);
// vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
mulv(vtmp1, T8B, vtmp1, vsrc);
// vtmp2 = vtmp1[4:7]
ins(vtmp2, S, vtmp1, 0, 1);
// vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
mulv(vtmp1, T8B, vtmp2, vtmp1);
} else {
ins(vtmp1, S, vsrc, 0, 1);
mulv(vtmp1, T8B, vtmp1, vsrc);
}
// vtmp2 = vtmp1[2:3]
ins(vtmp2, H, vtmp1, 0, 1);
// vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
mulv(vtmp2, T8B, vtmp2, vtmp1);
// dst = vtmp2[0] * isrc * vtmp2[1]
umov(rscratch1, vtmp2, B, 0);
mulw(dst, rscratch1, isrc);
sxtb(dst, dst);
umov(rscratch1, vtmp2, B, 1);
mulw(dst, rscratch1, dst);
sxtb(dst, dst);
break;
case T_SHORT:
if (isQ) {
ins(vtmp2, D, vsrc, 0, 1);
mulv(vtmp2, T4H, vtmp2, vsrc);
ins(vtmp1, S, vtmp2, 0, 1);
mulv(vtmp1, T4H, vtmp1, vtmp2);
} else {
ins(vtmp1, S, vsrc, 0, 1);
mulv(vtmp1, T4H, vtmp1, vsrc);
}
umov(rscratch1, vtmp1, H, 0);
mulw(dst, rscratch1, isrc);
sxth(dst, dst);
umov(rscratch1, vtmp1, H, 1);
mulw(dst, rscratch1, dst);
sxth(dst, dst);
break;
case T_INT:
if (isQ) {
ins(vtmp1, D, vsrc, 0, 1);
mulv(vtmp1, T2S, vtmp1, vsrc);
} else {
vtmp1 = vsrc;
}
umov(rscratch1, vtmp1, S, 0);
mul(dst, rscratch1, isrc);
umov(rscratch1, vtmp1, S, 1);
mul(dst, rscratch1, dst);
break;
case T_LONG:
umov(rscratch1, vsrc, D, 0);
mul(dst, isrc, rscratch1);
umov(rscratch1, vsrc, D, 1);
mul(dst, dst, rscratch1);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_mul_integral");
}
// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
FloatRegister fsrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_mul_fp {");
switch(bt) {
case T_FLOAT:
fmuls(dst, fsrc, vsrc);
ins(vtmp, S, vsrc, 0, 1);
fmuls(dst, dst, vtmp);
if (isQ) {
ins(vtmp, S, vsrc, 0, 2);
fmuls(dst, dst, vtmp);
ins(vtmp, S, vsrc, 0, 3);
fmuls(dst, dst, vtmp);
}
break;
case T_DOUBLE:
assert(isQ, "unsupported");
fmuld(dst, fsrc, vsrc);
ins(vtmp, D, vsrc, 0, 1);
fmuld(dst, dst, vtmp);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_mul_fp");
}
// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
Register Rn, Register Rm,
enum shift_kind kind, unsigned shift) {
switch(opc) {
case Op_AndReductionV:
is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
break;
case Op_OrReductionV:
is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
break;
case Op_XorReductionV:
is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
}
// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes) {
assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
"unsupported");
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
BLOCK_COMMENT("neon_reduce_logical {");
umov(rscratch1, vsrc, isQ ? D : S, 0);
umov(dst, vsrc, isQ ? D : S, 1);
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
switch(bt) {
case T_BYTE:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
sxtb(dst, dst);
break;
case T_SHORT:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
sxth(dst, dst);
break;
case T_INT:
if (isQ) {
neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
}
neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
break;
case T_LONG:
assert(isQ, "unsupported");
neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
BLOCK_COMMENT("} neon_reduce_logical");
}
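// A scalar sketch of the folding above for the byte case on a 128-bit vector
// (illustrative only): with the two 64-bit halves h0 and h1 of the vector,
//
//   uint64_t r = h1 OP h0;
//   r = r OP (r >> 32);             // 64-bit op
//   uint32_t w = (uint32_t)r;
//   w = w OP (w >> 16);
//   w = w OP (w >> 8);
//   result = (int8_t)(isrc OP w);   // combine with the scalar input and sign-extend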
// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
Register isrc, FloatRegister vsrc,
unsigned vector_length_in_bytes,
FloatRegister vtmp) {
assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
assert_different_registers(dst, isrc);
bool isQ = vector_length_in_bytes == 16;
bool is_min = opc == Op_MinReductionV;
BLOCK_COMMENT("neon_reduce_minmax_integral {");
if (bt == T_LONG) {
assert(vtmp == fnoreg, "should be");
assert(isQ, "should be");
umov(rscratch1, vsrc, D, 0);
cmp(isrc, rscratch1);
csel(dst, isrc, rscratch1, is_min ? LT : GT);
umov(rscratch1, vsrc, D, 1);
cmp(dst, rscratch1);
csel(dst, dst, rscratch1, is_min ? LT : GT);
} else {
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
if (size == T2S) {
is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
} else {
is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
}
if (bt == T_INT) {
umov(dst, vtmp, S, 0);
} else {
smov(dst, vtmp, elemType_to_regVariant(bt), 0);
}
cmpw(dst, isrc);
cselw(dst, dst, isrc, is_min ? LT : GT);
}
BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
FloatRegister src2, PRegister pg, FloatRegister tmp) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
assert(pg->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(src1, dst);
// Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
switch (opc) {
case Op_AddReductionVI: {
sve_uaddv(tmp, size, pg, src2);
if (bt == T_BYTE) {
smov(dst, tmp, size, 0);
addw(dst, src1, dst, ext::sxtb);
} else if (bt == T_SHORT) {
smov(dst, tmp, size, 0);
addw(dst, src1, dst, ext::sxth);
} else {
umov(dst, tmp, size, 0);
addw(dst, dst, src1);
}
break;
}
case Op_AddReductionVL: {
sve_uaddv(tmp, size, pg, src2);
umov(dst, tmp, size, 0);
add(dst, dst, src1);
break;
}
case Op_AndReductionV: {
sve_andv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
andr(dst, dst, src1);
} else {
andw(dst, dst, src1);
}
break;
}
case Op_OrReductionV: {
sve_orv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
orr(dst, dst, src1);
} else {
orrw(dst, dst, src1);
}
break;
}
case Op_XorReductionV: {
sve_eorv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
eor(dst, dst, src1);
} else {
eorw(dst, dst, src1);
}
break;
}
case Op_MaxReductionV: {
sve_smaxv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
cmp(dst, src1);
csel(dst, dst, src1, Assembler::GT);
} else {
cmpw(dst, src1);
cselw(dst, dst, src1, Assembler::GT);
}
break;
}
case Op_MinReductionV: {
sve_sminv(tmp, size, pg, src2);
if (bt == T_INT || bt == T_LONG) {
umov(dst, tmp, size, 0);
} else {
smov(dst, tmp, size, 0);
}
if (bt == T_LONG) {
cmp(dst, src1);
csel(dst, dst, src1, Assembler::LT);
} else {
cmpw(dst, src1);
cselw(dst, dst, src1, Assembler::LT);
}
break;
}
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
if (bt == T_BYTE) {
sxtb(dst, dst);
} else if (bt == T_SHORT) {
sxth(dst, dst);
}
}
}
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
uint32_t max_vector_length = Matcher::max_vector_size(bt);
assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
// Set all elements to false if the input "lane_cnt" is zero.
if (lane_cnt == 0) {
sve_pfalse(dst);
return;
}
SIMD_RegVariant size = elemType_to_regVariant(bt);
assert(size != Q, "invalid size");
// Set all elements to true if "lane_cnt" equals the max lane count.
if (lane_cnt == max_vector_length) {
sve_ptrue(dst, size, /* ALL */ 0b11111);
return;
}
// Fixed numbers for "ptrue".
switch(lane_cnt) {
case 1: /* VL1 */
case 2: /* VL2 */
case 3: /* VL3 */
case 4: /* VL4 */
case 5: /* VL5 */
case 6: /* VL6 */
case 7: /* VL7 */
case 8: /* VL8 */
sve_ptrue(dst, size, lane_cnt);
return;
case 16:
sve_ptrue(dst, size, /* VL16 */ 0b01001);
return;
case 32:
sve_ptrue(dst, size, /* VL32 */ 0b01010);
return;
case 64:
sve_ptrue(dst, size, /* VL64 */ 0b01011);
return;
case 128:
sve_ptrue(dst, size, /* VL128 */ 0b01100);
return;
case 256:
sve_ptrue(dst, size, /* VL256 */ 0b01101);
return;
default:
break;
}
// Special patterns for "ptrue".
if (lane_cnt == round_down_power_of_2(max_vector_length)) {
sve_ptrue(dst, size, /* POW2 */ 0b00000);
} else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
sve_ptrue(dst, size, /* MUL4 */ 0b11101);
} else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
sve_ptrue(dst, size, /* MUL3 */ 0b11110);
} else {
// Fall back to "whileltw" for the remaining cases.
mov(rscratch1, lane_cnt);
sve_whileltw(dst, size, zr, rscratch1);
}
}
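// A worked example of the pattern selection above, assuming a 512-bit SVE
// implementation and bt == T_BYTE, so max_vector_length == 64:
//   lane_cnt == 0               -> sve_pfalse
//   lane_cnt == 64              -> sve_ptrue ALL
//   lane_cnt in [1, 8], 16, 32  -> one of the fixed VLn encodings
//   lane_cnt == 63              -> MUL3 (64 - 64 % 3 == 63)
//   anything else, e.g. 40      -> mov rscratch1, 40; sve_whileltw(dst, B, zr, rscratch1)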
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2);
assert_different_registers(mask, pgtmp);
// Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
// mask = 0001 0000 0000 0001 0001 0000 0001 0001
// Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_dup(vtmp2, H, 0);
// Extend lowest half to type INT.
// dst = 00004444 00003333 00002222 00001111
sve_uunpklo(dst, S, src);
// pgtmp = 00000001 00000000 00000001 00000001
sve_punpklo(pgtmp, mask);
// Pack the active INT-sized elements to the right,
// and fill the remaining lanes with zero.
// dst = 00000000 00004444 00002222 00001111
sve_compact(dst, S, dst, pgtmp);
// Narrow the result back to type SHORT.
// dst = 0000 0000 0000 0000 0000 4444 2222 1111
sve_uzp1(dst, H, dst, vtmp2);
// Count the active elements of lowest half.
// rscratch1 = 3
sve_cntp(rscratch1, S, ptrue, pgtmp);
// Repeat for the highest half.
// pgtmp = 00000001 00000000 00000000 00000001
sve_punpkhi(pgtmp, mask);
// vtmp1 = 00008888 00007777 00006666 00005555
sve_uunpkhi(vtmp1, S, src);
// vtmp1 = 00000000 00000000 00008888 00005555
sve_compact(vtmp1, S, vtmp1, pgtmp);
// vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
sve_uzp1(vtmp1, H, vtmp1, vtmp2);
// Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
// Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
// Left shift (cross lane) the compressed high part by TRUE_CNT lanes, where
// TRUE_CNT is the number of active elements in the compressed low part.
neg(rscratch1, rscratch1);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, H, rscratch1, 1);
// vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
sve_tbl(vtmp1, H, vtmp1, vtmp2);
// Combine the compressed high part (after shifting) with the compressed low part.
// dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_orr(dst, dst, vtmp1);
}
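// The net effect of sve_compress_short is the scalar loop below; "lane_count",
// "mask_lane()", "src_lane()" and "dst_lane()" are illustrative helpers only:
//
//   int j = 0;
//   for (int i = 0; i < lane_count; i++) {
//     if (mask_lane(i)) {
//       dst_lane(j++) = src_lane(i);   // active elements packed to the lowest lanes
//     }
//   }
//   for (; j < lane_count; j++) {
//     dst_lane(j) = 0;                 // remaining lanes are zero-filled
//   }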
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
assert_different_registers(mask, ptmp, pgtmp);
// Example input: src = 88 77 66 55 44 33 22 11
// mask = 01 00 00 01 01 00 01 01
// Expected result: dst = 00 00 00 88 55 44 22 11
sve_dup(vtmp4, B, 0);
// Extend lowest half to type SHORT.
// vtmp1 = 0044 0033 0022 0011
sve_uunpklo(vtmp1, H, src);
// ptmp = 0001 0000 0001 0001
sve_punpklo(ptmp, mask);
// Count the active elements of lowest half.
// rscratch2 = 3
sve_cntp(rscratch2, H, ptrue, ptmp);
// Pack the active SHORT-sized elements to the right,
// and fill the remaining lanes with zero.
// dst = 0000 0044 0022 0011
sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
// Narrow the result back to type BYTE.
// dst = 00 00 00 00 00 44 22 11
sve_uzp1(dst, B, dst, vtmp4);
// Repeat for the highest half.
// ptmp = 0001 0000 0000 0001
sve_punpkhi(ptmp, mask);
// vtmp2 = 0088 0077 0066 0055
sve_uunpkhi(vtmp2, H, src);
// vtmp1 = 0000 0000 0088 0055
sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
sve_dup(vtmp4, B, 0);
// vtmp1 = 00 00 00 00 00 00 88 55
sve_uzp1(vtmp1, B, vtmp1, vtmp4);
// Compressed low: dst = 00 00 00 00 00 44 22 11
// Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
// Left shift (cross lane) the compressed high part by TRUE_CNT lanes, where
// TRUE_CNT is the number of active elements in the compressed low part.
neg(rscratch2, rscratch2);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, B, rscratch2, 1);
// vtmp1 = 00 00 00 88 55 00 00 00
sve_tbl(vtmp1, B, vtmp1, vtmp2);
// Combine the compressed high part (after shifting) with the compressed low part.
// dst = 00 00 00 88 55 44 22 11
sve_orr(dst, dst, vtmp1);
}
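// SVE "compact" only supports S and D element sizes, which is why the byte variant
// widens each half to SHORT and reuses sve_compress_short, then stitches the halves
// back together. Schematically, with TRUE_CNT the active-lane count of the low half:
//
//   dst = compress(low half)                                      // lanes [0, TRUE_CNT)
//       | compress(high half) shifted left by TRUE_CNT byte lanes // via sve_index/sve_tbl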
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
SIMD_Arrangement size = isQ ? T16B : T8B;
if (bt == T_BYTE) {
rbit(dst, size, src);
} else {
neon_reverse_bytes(dst, src, bt, isQ);
rbit(dst, size, dst);
}
}
void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
SIMD_Arrangement size = isQ ? T16B : T8B;
switch (bt) {
case T_BYTE:
if (dst != src) {
orr(dst, size, src, src);
}
break;
case T_SHORT:
rev16(dst, size, src);
break;
case T_INT:
rev32(dst, size, src);
break;
case T_LONG:
rev64(dst, size, src);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
}
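// neon_reverse_bits composes the two steps: bytes are reordered within each element
// (rev16/rev32/rev64) and the bits inside every byte are then reversed (rbit), which
// together reverse the bit order of the whole element. A worked example for one T_INT
// lane (value illustrative):
//
//   element               = 0x01020304
//   after rev32           = 0x04030201   // byte order reversed within the element
//   after rbit (per byte) = 0x20C04080   // == full bit reversal of 0x01020304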
// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
int idx, FloatRegister vtmp) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
if (bt == T_INT || bt == T_LONG) {
umov(dst, src, size, idx);
} else {
smov(dst, src, size, idx);
}
} else {
sve_orr(vtmp, src, src);
sve_ext(vtmp, vtmp, idx << size);
if (bt == T_INT || bt == T_LONG) {
umov(dst, vtmp, size, 0);
} else {
smov(dst, vtmp, size, 0);
}
}
}
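// A worked example for the SVE path above, assuming bt == T_INT and idx == 5:
// 32 * 5 == 160 >= 128, so a plain NEON umov cannot reach the element. The code
// copies src into vtmp and uses sve_ext with a byte offset of idx << size
// (5 << 2 == 20 bytes) to bring the wanted element down to lane 0, which is then
// moved into dst.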
// java.lang.Math::round intrinsics
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
assert_different_registers(tmp1, tmp2, tmp3, src, dst);
switch (T) {
case T2S:
case T4S:
fmovs(tmp1, T, 0.5f);
mov(rscratch1, jint_cast(0x1.0p23f));
break;
case T2D:
fmovd(tmp1, T, 0.5);
mov(rscratch1, julong_cast(0x1.0p52));
break;
default:
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
}
fadd(tmp1, T, tmp1, src);
fcvtms(tmp1, T, tmp1);
// tmp1 = floor(src + 0.5, ties to even)
fcvtas(dst, T, src);
// dst = round(src), ties to away
fneg(tmp3, T, src);
dup(tmp2, T, rscratch1);
cm(HS, tmp3, T, tmp3, tmp2);
// tmp3 is now a set of flags
bif(dst, T16B, tmp1, tmp3);
// result in dst
}
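// A scalar sketch of the per-lane selection above for float lanes (the double case
// uses 0x1.0p52); "bits()" denotes the raw IEEE bit pattern and is illustrative only:
//
//   bool keep_ties_to_away = bits(-src) >= bits(0x1.0p23f);   // unsigned compare (cm HS)
//   // true when src is +0.0, positive, NaN, or a negative with magnitude >= 2^23
//   dst = keep_ties_to_away ? fcvtas(src)          // round to nearest, ties away from zero
//                           : fcvtms(src + 0.5f);  // floor of src + 0.5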
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(tmp1, tmp2, src, dst);
switch (T) {
case S:
mov(rscratch1, jint_cast(0x1.0p23f));
break;
case D:
mov(rscratch1, julong_cast(0x1.0p52));
break;
default:
assert(T == S || T == D, "invalid register variant");
}
sve_frinta(dst, T, ptrue, src);
// dst = round(src), ties to away
Label none;
sve_fneg(tmp1, T, ptrue, src);
sve_dup(tmp2, T, rscratch1);
sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
br(EQ, none);
{
sve_cpy(tmp1, T, pgtmp, 0.5);
sve_fadd(tmp1, T, pgtmp, src);
sve_frintm(dst, T, pgtmp, tmp1);
// dst = floor(src + 0.5, ties to even)
}
bind(none);
sve_fcvtzs(dst, T, ptrue, dst, T);
// result in dst
}
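// The SVE variant makes the same per-lane choice with the compare reversed: pgtmp is
// active where bits(0x1.0p23f) >= bits(-src), i.e. for -0.0 and for negative lanes with
// magnitude <= 2^23; those lanes take floor(src + 0.5) via sve_frintm, the remaining
// lanes keep the sve_frinta (ties away) result, and sve_fcvtzs converts the selected
// value to an integer.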
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
FloatRegister one, SIMD_Arrangement T) {
assert_different_registers(dst, src, zero, one);
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
facgt(dst, T, src, zero);
ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
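// A scalar sketch of the signum selection above; "mask" is the facgt/ushr result and
// "bits()" is illustrative only:
//
//   juint mask = (fabs(src) > 0.0) ? 0x7fffffff : 0;   // 0 for +-0.0 and NaN
//   dst = (bits(one) & mask) | (bits(src) & ~mask);
//   // i.e. +-1.0 carrying src's sign bit, or src itself for +-0.0 and NaN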
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
assert_different_registers(dst, src, zero, one, vtmp);
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
sve_orr(vtmp, src, src);
sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
switch (T) {
case S:
sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
// on the sign of the float value
break;
case D:
sve_and(vtmp, T, min_jlong);
sve_orr(vtmp, T, jlong_cast(1.0));
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
// Result in dst
}
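// The SVE variant reaches the same result with a predicate instead of a bit mask:
// sve_fac(GT) marks lanes where |src| > 0.0, vtmp is built as (src & sign bit) | bits(1.0),
// i.e. +-1.0 with src's sign, and sve_sel copies vtmp into the marked lanes while the
// unmarked lanes (+-0.0 and NaN) keep src unchanged.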
bool C2_MacroAssembler::in_scratch_emit_size() {
if (ciEnv::current()->task() != nullptr) {
PhaseOutput* phase_output = Compile::current()->output();
if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
return true;
}
}
return MacroAssembler::in_scratch_emit_size();
}