| /* |
| * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. |
| * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. |
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| * |
| * This code is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 only, as |
| * published by the Free Software Foundation. |
| * |
| * This code is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| * version 2 for more details (a copy is included in the LICENSE file that |
| * accompanied this code). |
| * |
| * You should have received a copy of the GNU General Public License version |
| * 2 along with this work; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| * |
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| * or visit www.oracle.com if you need additional information or have any |
| * questions. |
| * |
| */ |
| |
| #include "precompiled.hpp" |
| #include "asm/macroAssembler.hpp" |
| #include "asm/macroAssembler.inline.hpp" |
| #include "asm/register.hpp" |
| #include "atomic_aarch64.hpp" |
| #include "compiler/oopMap.hpp" |
| #include "gc/shared/barrierSet.hpp" |
| #include "gc/shared/barrierSetAssembler.hpp" |
| #include "gc/shared/gc_globals.hpp" |
| #include "gc/shared/tlab_globals.hpp" |
| #include "interpreter/interpreter.hpp" |
| #include "memory/universe.hpp" |
| #include "nativeInst_aarch64.hpp" |
| #include "oops/instanceOop.hpp" |
| #include "oops/method.hpp" |
| #include "oops/objArrayKlass.hpp" |
| #include "oops/oop.inline.hpp" |
| #include "prims/methodHandles.hpp" |
| #include "runtime/atomic.hpp" |
| #include "runtime/continuation.hpp" |
| #include "runtime/continuationEntry.inline.hpp" |
| #include "runtime/frame.inline.hpp" |
| #include "runtime/handles.inline.hpp" |
| #include "runtime/javaThread.hpp" |
| #include "runtime/sharedRuntime.hpp" |
| #include "runtime/stubCodeGenerator.hpp" |
| #include "runtime/stubRoutines.hpp" |
| #include "utilities/align.hpp" |
| #include "utilities/globalDefinitions.hpp" |
| #include "utilities/powerOfTwo.hpp" |
| #ifdef COMPILER2 |
| #include "opto/runtime.hpp" |
| #endif |
| #if INCLUDE_ZGC |
| #include "gc/z/zThreadLocalData.hpp" |
| #endif |
| |
| // Declaration and definition of StubGenerator (no .hpp file). |
| // For a more detailed description of the stub routine structure |
| // see the comment in stubRoutines.hpp |
| |
| #undef __ |
| #define __ _masm-> |
| |
| #ifdef PRODUCT |
| #define BLOCK_COMMENT(str) /* nothing */ |
| #else |
| #define BLOCK_COMMENT(str) __ block_comment(str) |
| #endif |
| |
| #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") |
| |
| // Stub Code definitions |
| |
| class StubGenerator: public StubCodeGenerator { |
| private: |
| |
| #ifdef PRODUCT |
| #define inc_counter_np(counter) ((void)0) |
| #else |
| void inc_counter_np_(int& counter) { |
| __ lea(rscratch2, ExternalAddress((address)&counter)); |
| __ ldrw(rscratch1, Address(rscratch2)); |
| __ addw(rscratch1, rscratch1, 1); |
| __ strw(rscratch1, Address(rscratch2)); |
| } |
| #define inc_counter_np(counter) \ |
| BLOCK_COMMENT("inc_counter " #counter); \ |
| inc_counter_np_(counter); |
| #endif |
| |
| // Call stubs are used to call Java from C |
| // |
| // Arguments: |
| // c_rarg0: call wrapper address                address |
| // c_rarg1: result                              address |
| // c_rarg2: result type                         BasicType |
| // c_rarg3: method                              Method* |
| // c_rarg4: (interpreter) entry point           address |
| // c_rarg5: parameters                          intptr_t* |
| // c_rarg6: parameter size (in words)           int |
| // c_rarg7: thread                              Thread* |
| // |
| // There is no return from the stub itself as any Java result |
| // is written to result |
| // |
| // we save r30 (lr) as the return PC at the base of the frame and |
| // link r29 (fp) below it as the frame pointer, installing sp (r31) |
| // into fp. |
| // |
| // we save r0-r7, which accounts for all the c arguments. |
| // |
| // TODO: strictly do we need to save them all? they are treated as |
| // volatile by C so could we omit saving the ones we are going to |
| // place in global registers (thread? method?) or those we only use |
| // during setup of the Java call? |
| // |
| // we don't need to save r8 which C uses as an indirect result location |
| // return register. |
| // |
| // we don't need to save r9-r15 which both C and Java treat as |
| // volatile |
| // |
| // we don't need to save r16-r18 because Java does not use them |
| // |
| // we save r19-r28 which Java uses as scratch registers and C |
| // expects to be callee-save |
| // |
| // we save the bottom 64 bits of each value stored in v8-v15; it is |
| // the responsibility of the caller to preserve larger values. |
| // |
| // so the stub frame looks like this when we enter Java code |
| // |
| // [ return_from_Java ] <--- sp |
| // [ argument word n ] |
| // ... |
| // -27 [ argument word 1 ] |
| // -26 [ saved v15 ] <--- sp_after_call |
| // -25 [ saved v14 ] |
| // -24 [ saved v13 ] |
| // -23 [ saved v12 ] |
| // -22 [ saved v11 ] |
| // -21 [ saved v10 ] |
| // -20 [ saved v9 ] |
| // -19 [ saved v8 ] |
| // -18 [ saved r28 ] |
| // -17 [ saved r27 ] |
| // -16 [ saved r26 ] |
| // -15 [ saved r25 ] |
| // -14 [ saved r24 ] |
| // -13 [ saved r23 ] |
| // -12 [ saved r22 ] |
| // -11 [ saved r21 ] |
| // -10 [ saved r20 ] |
| // -9 [ saved r19 ] |
| // -8 [ call wrapper (r0) ] |
| // -7 [ result (r1) ] |
| // -6 [ result type (r2) ] |
| // -5 [ method (r3) ] |
| // -4 [ entry point (r4) ] |
| // -3 [ parameters (r5) ] |
| // -2 [ parameter size (r6) ] |
| // -1 [ thread (r7) ] |
| // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) |
| // 1 [ saved lr (r30) ] |
| |
| // Call stub stack layout word offsets from fp |
| enum call_stub_layout { |
| sp_after_call_off = -26, |
| |
| d15_off = -26, |
| d13_off = -24, |
| d11_off = -22, |
| d9_off = -20, |
| |
| r28_off = -18, |
| r26_off = -16, |
| r24_off = -14, |
| r22_off = -12, |
| r20_off = -10, |
| call_wrapper_off = -8, |
| result_off = -7, |
| result_type_off = -6, |
| method_off = -5, |
| entry_point_off = -4, |
| parameter_size_off = -2, |
| thread_off = -1, |
| fp_f = 0, |
| retaddr_off = 1, |
| }; |
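| |
| // n.b. a worked example of the layout above: with wordSize == 8 these |
| // offsets scale to byte displacements from rfp, so sp_after_call sits at |
| // rfp + sp_after_call_off * wordSize = rfp - 208, and the save area from |
| // the v15 slot up to the saved lr spans |
| // retaddr_off - sp_after_call_off + 1 == 28 words (224 bytes). |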
| |
| address generate_call_stub(address& return_address) { |
| assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && |
| (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, |
| "adjust this code"); |
| |
| StubCodeMark mark(this, "StubRoutines", "call_stub"); |
| address start = __ pc(); |
| |
| const Address sp_after_call(rfp, sp_after_call_off * wordSize); |
| |
| const Address call_wrapper (rfp, call_wrapper_off * wordSize); |
| const Address result (rfp, result_off * wordSize); |
| const Address result_type (rfp, result_type_off * wordSize); |
| const Address method (rfp, method_off * wordSize); |
| const Address entry_point (rfp, entry_point_off * wordSize); |
| const Address parameter_size(rfp, parameter_size_off * wordSize); |
| |
| const Address thread (rfp, thread_off * wordSize); |
| |
| const Address d15_save (rfp, d15_off * wordSize); |
| const Address d13_save (rfp, d13_off * wordSize); |
| const Address d11_save (rfp, d11_off * wordSize); |
| const Address d9_save (rfp, d9_off * wordSize); |
| |
| const Address r28_save (rfp, r28_off * wordSize); |
| const Address r26_save (rfp, r26_off * wordSize); |
| const Address r24_save (rfp, r24_off * wordSize); |
| const Address r22_save (rfp, r22_off * wordSize); |
| const Address r20_save (rfp, r20_off * wordSize); |
| |
| // stub code |
| |
| address aarch64_entry = __ pc(); |
| |
| // set up frame and move sp to end of save area |
| __ enter(); |
| __ sub(sp, rfp, -sp_after_call_off * wordSize); |
| |
| // save register parameters and Java scratch/global registers |
| // n.b. we save thread even though it gets installed in |
| // rthread because we want to sanity check rthread later |
| __ str(c_rarg7, thread); |
| __ strw(c_rarg6, parameter_size); |
| __ stp(c_rarg4, c_rarg5, entry_point); |
| __ stp(c_rarg2, c_rarg3, result_type); |
| __ stp(c_rarg0, c_rarg1, call_wrapper); |
| |
| __ stp(r20, r19, r20_save); |
| __ stp(r22, r21, r22_save); |
| __ stp(r24, r23, r24_save); |
| __ stp(r26, r25, r26_save); |
| __ stp(r28, r27, r28_save); |
| |
| __ stpd(v9, v8, d9_save); |
| __ stpd(v11, v10, d11_save); |
| __ stpd(v13, v12, d13_save); |
| __ stpd(v15, v14, d15_save); |
| |
| // install Java thread in global register now we have saved |
| // whatever value it held |
| __ mov(rthread, c_rarg7); |
| // And method |
| __ mov(rmethod, c_rarg3); |
| |
| // set up the heapbase register |
| __ reinit_heapbase(); |
| |
| #ifdef ASSERT |
| // make sure we have no pending exceptions |
| { |
| Label L; |
| __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); |
| __ cmp(rscratch1, (u1)NULL_WORD); |
| __ br(Assembler::EQ, L); |
| __ stop("StubRoutines::call_stub: entered with pending exception"); |
| __ BIND(L); |
| } |
| #endif |
| // pass parameters if any |
| __ mov(esp, sp); |
| __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way |
| __ andr(sp, rscratch1, -2 * wordSize); |
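| // worked example (illustrative): for c_rarg6 == 3 parameter words |
| // rscratch1 = sp - 24, and the andr with -16 rounds sp down to the next |
| // 16-byte boundary, as the AArch64 ABI requires for sp. |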
| |
| BLOCK_COMMENT("pass parameters if any"); |
| Label parameters_done; |
| // parameter count is still in c_rarg6 |
| // and parameter pointer identifying param 1 is in c_rarg5 |
| __ cbzw(c_rarg6, parameters_done); |
| |
| address loop = __ pc(); |
| __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); |
| __ subsw(c_rarg6, c_rarg6, 1); |
| __ push(rscratch1); |
| __ br(Assembler::GT, loop); |
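| // n.b. (illustrative) the loop pushes parameters in ascending index |
| // order, so for count == 2 the stack ends up as |
| // [ param 2 ] <--- sp |
| // [ param 1 ] |
| // i.e. parameter 1 sits at the higher address, which is the layout the |
| // interpreted entry point expects. |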
| |
| __ BIND(parameters_done); |
| |
| // call Java entry -- passing Method* and current sp |
| // rmethod: Method* |
| // r19_sender_sp: sender sp |
| BLOCK_COMMENT("call Java function"); |
| __ mov(r19_sender_sp, sp); |
| __ blr(c_rarg4); |
| |
| // we do this here because the notify will already have been done |
| // if we get to the next instruction via an exception |
| // |
| // n.b. adding this instruction here affects the calculation of |
| // whether or not a routine returns to the call stub (used when |
| // doing stack walks) since the normal test is to check the return |
| // pc against the address saved below. so we may need to allow for |
| // this extra instruction in the check. |
| |
| // save current address for use by exception handling code |
| |
| return_address = __ pc(); |
| |
| // store result depending on type (everything that is not |
| // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) |
| // n.b. this assumes Java returns an integral result in r0 |
| // and a floating result in j_farg0 |
| __ ldr(j_rarg2, result); |
| Label is_long, is_float, is_double, exit; |
| __ ldr(j_rarg1, result_type); |
| __ cmp(j_rarg1, (u1)T_OBJECT); |
| __ br(Assembler::EQ, is_long); |
| __ cmp(j_rarg1, (u1)T_LONG); |
| __ br(Assembler::EQ, is_long); |
| __ cmp(j_rarg1, (u1)T_FLOAT); |
| __ br(Assembler::EQ, is_float); |
| __ cmp(j_rarg1, (u1)T_DOUBLE); |
| __ br(Assembler::EQ, is_double); |
| |
| // handle T_INT case |
| __ strw(r0, Address(j_rarg2)); |
| |
| __ BIND(exit); |
| |
| // pop parameters |
| __ sub(esp, rfp, -sp_after_call_off * wordSize); |
| |
| #ifdef ASSERT |
| // verify that threads correspond |
| { |
| Label L, S; |
| __ ldr(rscratch1, thread); |
| __ cmp(rthread, rscratch1); |
| __ br(Assembler::NE, S); |
| __ get_thread(rscratch1); |
| __ cmp(rthread, rscratch1); |
| __ br(Assembler::EQ, L); |
| __ BIND(S); |
| __ stop("StubRoutines::call_stub: threads must correspond"); |
| __ BIND(L); |
| } |
| #endif |
| |
| __ pop_cont_fastpath(rthread); |
| |
| // restore callee-save registers |
| __ ldpd(v15, v14, d15_save); |
| __ ldpd(v13, v12, d13_save); |
| __ ldpd(v11, v10, d11_save); |
| __ ldpd(v9, v8, d9_save); |
| |
| __ ldp(r28, r27, r28_save); |
| __ ldp(r26, r25, r26_save); |
| __ ldp(r24, r23, r24_save); |
| __ ldp(r22, r21, r22_save); |
| __ ldp(r20, r19, r20_save); |
| |
| __ ldp(c_rarg0, c_rarg1, call_wrapper); |
| __ ldrw(c_rarg2, result_type); |
| __ ldr(c_rarg3, method); |
| __ ldp(c_rarg4, c_rarg5, entry_point); |
| __ ldp(c_rarg6, c_rarg7, parameter_size); |
| |
| // leave frame and return to caller |
| __ leave(); |
| __ ret(lr); |
| |
| // handle return types different from T_INT |
| |
| __ BIND(is_long); |
| __ str(r0, Address(j_rarg2, 0)); |
| __ br(Assembler::AL, exit); |
| |
| __ BIND(is_float); |
| __ strs(j_farg0, Address(j_rarg2, 0)); |
| __ br(Assembler::AL, exit); |
| |
| __ BIND(is_double); |
| __ strd(j_farg0, Address(j_rarg2, 0)); |
| __ br(Assembler::AL, exit); |
| |
| return start; |
| } |
| |
| // Return point for a Java call if there's an exception thrown in |
| // Java code. The exception is caught and transformed into a |
| // pending exception stored in JavaThread that can be tested from |
| // within the VM. |
| // |
| // Note: Usually the parameters are removed by the callee. In case |
| // of an exception crossing an activation frame boundary, that is |
| // not the case if the callee is compiled code => need to set up the |
| // sp. |
| // |
| // r0: exception oop |
| |
| address generate_catch_exception() { |
| StubCodeMark mark(this, "StubRoutines", "catch_exception"); |
| address start = __ pc(); |
| |
| // same as in generate_call_stub(): |
| const Address sp_after_call(rfp, sp_after_call_off * wordSize); |
| const Address thread (rfp, thread_off * wordSize); |
| |
| #ifdef ASSERT |
| // verify that threads correspond |
| { |
| Label L, S; |
| __ ldr(rscratch1, thread); |
| __ cmp(rthread, rscratch1); |
| __ br(Assembler::NE, S); |
| __ get_thread(rscratch1); |
| __ cmp(rthread, rscratch1); |
| __ br(Assembler::EQ, L); |
| __ bind(S); |
| __ stop("StubRoutines::catch_exception: threads must correspond"); |
| __ bind(L); |
| } |
| #endif |
| |
| // set pending exception |
| __ verify_oop(r0); |
| |
| __ str(r0, Address(rthread, Thread::pending_exception_offset())); |
| __ mov(rscratch1, (address)__FILE__); |
| __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); |
| __ movw(rscratch1, (int)__LINE__); |
| __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); |
| |
| // complete return to VM |
| assert(StubRoutines::_call_stub_return_address != nullptr, |
| "_call_stub_return_address must have been generated before"); |
| __ b(StubRoutines::_call_stub_return_address); |
| |
| return start; |
| } |
| |
| // Continuation point for runtime calls returning with a pending |
| // exception. The pending exception check happened in the runtime |
| // or native call stub. The pending exception in Thread is |
| // converted into a Java-level exception. |
| // |
| // Contract with Java-level exception handlers: |
| // r0: exception |
| // r3: throwing pc |
| // |
| // NOTE: At entry of this stub, exception-pc must be in LR !! |
| |
| // NOTE: this is always used as a jump target within generated code |
| // so it just needs to be generated code with no prolog |
| |
| address generate_forward_exception() { |
| StubCodeMark mark(this, "StubRoutines", "forward exception"); |
| address start = __ pc(); |
| |
| // Upon entry, LR points to the return address returning into |
| // Java (interpreted or compiled) code; i.e., the return address |
| // becomes the throwing pc. |
| // |
| // Arguments pushed before the runtime call are still on the stack |
| // but the exception handler will reset the stack pointer -> |
| // ignore them. A potential result in registers can be ignored as |
| // well. |
| |
| #ifdef ASSERT |
| // make sure this code is only executed if there is a pending exception |
| { |
| Label L; |
| __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); |
| __ cbnz(rscratch1, L); |
| __ stop("StubRoutines::forward exception: no pending exception (1)"); |
| __ bind(L); |
| } |
| #endif |
| |
| // compute exception handler into r19 |
| |
| // call the VM to find the handler address associated with the |
| // caller address. pass thread in r0 and caller pc (ret address) |
| // in r1. n.b. the caller pc is in lr, unlike x86 where it is on |
| // the stack. |
| __ mov(c_rarg1, lr); |
| // lr will be trashed by the VM call so we move it to R19 |
| // (callee-saved) because we also need to pass it to the handler |
| // returned by this call. |
| __ mov(r19, lr); |
| BLOCK_COMMENT("call exception_handler_for_return_address"); |
| __ call_VM_leaf(CAST_FROM_FN_PTR(address, |
| SharedRuntime::exception_handler_for_return_address), |
| rthread, c_rarg1); |
| // Reinitialize the ptrue predicate register, in case the external runtime |
| // call clobbers ptrue reg, as we may return to SVE compiled code. |
| __ reinitialize_ptrue(); |
| |
| // we should not really care that lr is no longer the callee |
| // address. we saved the value the handler needs in r19 so we can |
| // just copy it to r3. however, the C2 handler will push its own |
| // frame and then calls into the VM and the VM code asserts that |
| // the PC for the frame above the handler belongs to a compiled |
| // Java method. So, we restore lr here to satisfy that assert. |
| __ mov(lr, r19); |
| // setup r0 & r3 & clear pending exception |
| __ mov(r3, r19); |
| __ mov(r19, r0); |
| __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); |
| __ str(zr, Address(rthread, Thread::pending_exception_offset())); |
| |
| #ifdef ASSERT |
| // make sure exception is set |
| { |
| Label L; |
| __ cbnz(r0, L); |
| __ stop("StubRoutines::forward exception: no pending exception (2)"); |
| __ bind(L); |
| } |
| #endif |
| |
| // continue at exception handler |
| // r0: exception |
| // r3: throwing pc |
| // r19: exception handler |
| __ verify_oop(r0); |
| __ br(r19); |
| |
| return start; |
| } |
| |
| // Non-destructive plausibility checks for oops |
| // |
| // Arguments: |
| // r0: oop to verify |
| // rscratch1: error message |
| // |
| // Stack after saving c_rarg3: |
| // [tos + 0]: saved c_rarg3 |
| // [tos + 1]: saved c_rarg2 |
| // [tos + 2]: saved lr |
| // [tos + 3]: saved rscratch2 |
| // [tos + 4]: saved r0 |
| // [tos + 5]: saved rscratch1 |
| address generate_verify_oop() { |
| |
| StubCodeMark mark(this, "StubRoutines", "verify_oop"); |
| address start = __ pc(); |
| |
| Label exit, error; |
| |
| // save c_rarg2 and c_rarg3 |
| __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); |
| |
| // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); |
| __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); |
| __ ldr(c_rarg3, Address(c_rarg2)); |
| __ add(c_rarg3, c_rarg3, 1); |
| __ str(c_rarg3, Address(c_rarg2)); |
| |
| // object is in r0 |
| // make sure object is 'reasonable' |
| __ cbz(r0, exit); // if obj is null it is OK |
| |
| BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); |
| bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); |
| |
| // return if everything seems ok |
| __ bind(exit); |
| |
| __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); |
| __ ret(lr); |
| |
| // handle errors |
| __ bind(error); |
| __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); |
| |
| __ push(RegSet::range(r0, r29), sp); |
| // debug(char* msg, int64_t pc, int64_t regs[]) |
| __ mov(c_rarg0, rscratch1); // pass address of error message |
| __ mov(c_rarg1, lr); // pass return address |
| __ mov(c_rarg2, sp); // pass address of regs on stack |
| #ifndef PRODUCT |
| assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); |
| #endif |
| BLOCK_COMMENT("call MacroAssembler::debug"); |
| __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); |
| __ blr(rscratch1); |
| __ hlt(0); |
| |
| return start; |
| } |
| |
| // Generate indices for iota vector. |
| address generate_iota_indices(const char *stub_name) { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", stub_name); |
| address start = __ pc(); |
| // B |
| __ emit_data64(0x0706050403020100, relocInfo::none); |
| __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); |
| // H |
| __ emit_data64(0x0003000200010000, relocInfo::none); |
| __ emit_data64(0x0007000600050004, relocInfo::none); |
| // S |
| __ emit_data64(0x0000000100000000, relocInfo::none); |
| __ emit_data64(0x0000000300000002, relocInfo::none); |
| // D |
| __ emit_data64(0x0000000000000000, relocInfo::none); |
| __ emit_data64(0x0000000000000001, relocInfo::none); |
| // S - FP |
| __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f |
| __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f |
| // D - FP |
| __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d |
| __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d |
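| // n.b. (illustrative) emit_data64 lays values down little-endian, so |
| // 0x0706050403020100 above yields bytes 00, 01, ... 07 at ascending |
| // addresses, i.e. byte lane i of the vector holds the value i. Likewise |
| // 0x3F80000000000000 packs 0.0f in the low 32-bit lane and 1.0f |
| // (0x3F800000) in the high lane. |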
| return start; |
| } |
| |
| // The inner part of zero_words(). This is the bulk operation, |
| // zeroing words in blocks, possibly using DC ZVA to do it. The |
| // caller is responsible for zeroing the last few words. |
| // |
| // Inputs: |
| // r10: the HeapWord-aligned base address of an array to zero. |
| // r11: the count in HeapWords, r11 > 0. |
| // |
| // Returns r10 and r11, adjusted for the caller to clear. |
| // r10: the base address of the tail of words left to clear. |
| // r11: the number of words in the tail. |
| // r11 < MacroAssembler::zero_words_block_size. |
| |
| address generate_zero_blocks() { |
| Label done; |
| Label base_aligned; |
| |
| Register base = r10, cnt = r11; |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "zero_blocks"); |
| address start = __ pc(); |
| |
| if (UseBlockZeroing) { |
| int zva_length = VM_Version::zva_length(); |
| |
| // Ensure ZVA length can be divided by 16. This is required by |
| // the subsequent operations. |
| assert (zva_length % 16 == 0, "Unexpected ZVA Length"); |
| |
| __ tbz(base, 3, base_aligned); |
| __ str(zr, Address(__ post(base, 8))); |
| __ sub(cnt, cnt, 1); |
| __ bind(base_aligned); |
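| // worked example (illustrative): if base enters with bit 3 set, say |
| // base % 16 == 8, the tbz falls through, one zero word is stored and |
| // base advances by 8, leaving it 16-byte aligned for the ZVA loop; |
| // cnt is decremented to match. |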
| |
| // Ensure count >= zva_length * 2 so that it still deserves a zva after |
| // alignment. |
| Label small; |
| int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); |
| __ subs(rscratch1, cnt, low_limit >> 3); |
| __ br(Assembler::LT, small); |
| __ zero_dcache_blocks(base, cnt); |
| __ bind(small); |
| } |
| |
| { |
| // Number of stp instructions we'll unroll |
| const int unroll = |
| MacroAssembler::zero_words_block_size / 2; |
| // Clear the remaining blocks. |
| Label loop; |
| __ subs(cnt, cnt, unroll * 2); |
| __ br(Assembler::LT, done); |
| __ bind(loop); |
| for (int i = 0; i < unroll; i++) |
| __ stp(zr, zr, __ post(base, 16)); |
| __ subs(cnt, cnt, unroll * 2); |
| __ br(Assembler::GE, loop); |
| __ bind(done); |
| __ add(cnt, cnt, unroll * 2); |
| } |
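| // n.b. (illustrative) assuming zero_words_block_size == 8, the loop above |
| // emits four stp instructions (64 bytes) per iteration; cnt stays biased |
| // by -8 inside the loop so the exit test is a plain GE, and the final add |
| // restores the true remaining count (< 8 words) for the caller to clear. |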
| |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| |
| typedef enum { |
| copy_forwards = 1, |
| copy_backwards = -1 |
| } copy_direction; |
| |
| // Helper object to reduce noise when telling the GC barriers how to perform loads and stores |
| // for arraycopy stubs. |
| class ArrayCopyBarrierSetHelper : StackObj { |
| BarrierSetAssembler* _bs_asm; |
| MacroAssembler* _masm; |
| DecoratorSet _decorators; |
| BasicType _type; |
| Register _gct1; |
| Register _gct2; |
| Register _gct3; |
| FloatRegister _gcvt1; |
| FloatRegister _gcvt2; |
| FloatRegister _gcvt3; |
| |
| public: |
| ArrayCopyBarrierSetHelper(MacroAssembler* masm, |
| DecoratorSet decorators, |
| BasicType type, |
| Register gct1, |
| Register gct2, |
| Register gct3, |
| FloatRegister gcvt1, |
| FloatRegister gcvt2, |
| FloatRegister gcvt3) |
| : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), |
| _masm(masm), |
| _decorators(decorators), |
| _type(type), |
| _gct1(gct1), |
| _gct2(gct2), |
| _gct3(gct3), |
| _gcvt1(gcvt1), |
| _gcvt2(gcvt2), |
| _gcvt3(gcvt3) { |
| } |
| |
| void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { |
| _bs_asm->copy_load_at(_masm, _decorators, _type, 32, |
| dst1, dst2, src, |
| _gct1, _gct2, _gcvt1); |
| } |
| |
| void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { |
| _bs_asm->copy_store_at(_masm, _decorators, _type, 32, |
| dst, src1, src2, |
| _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); |
| } |
| |
| void copy_load_at_16(Register dst1, Register dst2, Address src) { |
| _bs_asm->copy_load_at(_masm, _decorators, _type, 16, |
| dst1, dst2, src, |
| _gct1); |
| } |
| |
| void copy_store_at_16(Address dst, Register src1, Register src2) { |
| _bs_asm->copy_store_at(_masm, _decorators, _type, 16, |
| dst, src1, src2, |
| _gct1, _gct2, _gct3); |
| } |
| |
| void copy_load_at_8(Register dst, Address src) { |
| _bs_asm->copy_load_at(_masm, _decorators, _type, 8, |
| dst, noreg, src, |
| _gct1); |
| } |
| |
| void copy_store_at_8(Address dst, Register src) { |
| _bs_asm->copy_store_at(_masm, _decorators, _type, 8, |
| dst, src, noreg, |
| _gct1, _gct2, _gct3); |
| } |
| }; |
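| |
| // Usage sketch (illustrative): with no GC barriers in the way, a call |
| // such as |
| // bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| // reduces to a plain ldp of two 64-bit registers; collectors such as ZGC |
| // instead interpose their load/store barriers here, using the spare |
| // gct*/gcvt* temporaries supplied to the helper. |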
| |
| // Bulk copy of blocks of 8 words. |
| // |
| // count is a count of words. |
| // |
| // Precondition: count >= 8 |
| // |
| // Postconditions: |
| // |
| // The least significant bit of count contains the remaining count |
| // of words to copy. The rest of count is trash. |
| // |
| // s and d are adjusted to point to the remaining words to copy |
| // |
| void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, |
| copy_direction direction) { |
| int unit = wordSize * direction; |
| int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; |
| |
| const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, |
| t4 = r7, t5 = r11, t6 = r12, t7 = r13; |
| const Register stride = r14; |
| const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; |
| const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved |
| ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); |
| |
| assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); |
| assert_different_registers(s, d, count, rscratch1, rscratch2); |
| |
| Label again, drain; |
| const char *stub_name; |
| if (direction == copy_forwards) |
| stub_name = "forward_copy_longs"; |
| else |
| stub_name = "backward_copy_longs"; |
| |
| __ align(CodeEntryAlignment); |
| |
| StubCodeMark mark(this, "StubRoutines", stub_name); |
| |
| __ bind(start); |
| |
| Label unaligned_copy_long; |
| if (AvoidUnalignedAccesses) { |
| __ tbnz(d, 3, unaligned_copy_long); |
| } |
| |
| if (direction == copy_forwards) { |
| __ sub(s, s, bias); |
| __ sub(d, d, bias); |
| } |
| |
| #ifdef ASSERT |
| // Make sure we are never given < 8 words |
| { |
| Label L; |
| __ cmp(count, (u1)8); |
| __ br(Assembler::GE, L); |
| __ stop("genrate_copy_longs called with < 8 words"); |
| __ bind(L); |
| } |
| #endif |
| |
| // Fill 8 registers |
| if (UseSIMDForMemoryOps) { |
| bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); |
| bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); |
| } else { |
| bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); |
| bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); |
| bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); |
| } |
| |
| __ subs(count, count, 16); |
| __ br(Assembler::LO, drain); |
| |
| int prefetch = PrefetchCopyIntervalInBytes; |
| bool use_stride = false; |
| if (direction == copy_backwards) { |
| use_stride = prefetch > 256; |
| prefetch = -prefetch; |
| if (use_stride) __ mov(stride, prefetch); |
| } |
| |
| __ bind(again); |
| |
| if (PrefetchCopyIntervalInBytes > 0) |
| __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); |
| |
| if (UseSIMDForMemoryOps) { |
| bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); |
| bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); |
| bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); |
| bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); |
| } else { |
| bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); |
| bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); |
| bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); |
| bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); |
| bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); |
| bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); |
| bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); |
| } |
| |
| __ subs(count, count, 8); |
| __ br(Assembler::HS, again); |
| |
| // Drain |
| __ bind(drain); |
| if (UseSIMDForMemoryOps) { |
| bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); |
| bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); |
| } else { |
| bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); |
| bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); |
| bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); |
| bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); |
| } |
| |
| { |
| Label L1, L2; |
| __ tbz(count, exact_log2(4), L1); |
| if (UseSIMDForMemoryOps) { |
| bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); |
| bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); |
| } else { |
| bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); |
| bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); |
| bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); |
| } |
| __ bind(L1); |
| |
| if (direction == copy_forwards) { |
| __ add(s, s, bias); |
| __ add(d, d, bias); |
| } |
| |
| __ tbz(count, 1, L2); |
| bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); |
| bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); |
| __ bind(L2); |
| } |
| |
| __ ret(lr); |
| |
| if (AvoidUnalignedAccesses) { |
| Label drain, again; |
| // Register order for storing. Order is different for backward copy. |
| |
| __ bind(unaligned_copy_long); |
| |
| // source address is even (16-byte) aligned, target only odd (8-byte) aligned |
| // |
| // when forward copying word pairs we read long pairs at offsets |
| // {0, 2, 4, 6} (in long words). when backwards copying we read |
| // long pairs at offsets {-2, -4, -6, -8}. We adjust the source |
| // address by -2 in the forwards case so we can compute the |
| // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 |
| // or -1. |
| // |
| // when forward copying we need to store 1 word, 3 pairs and |
| // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a |
| // zero offset we adjust the destination by -1, which means we |
| // have to use offsets {1, 2, 4, 6, 8} * unit for the stores. |
| // |
| // When backwards copying we need to store 1 word, 3 pairs and |
| // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use |
| // offsets {1, 3, 5, 7, 8} * unit. |
| |
| if (direction == copy_forwards) { |
| __ sub(s, s, 16); |
| __ sub(d, d, 8); |
| } |
| |
| // Fill 8 registers |
| // |
| // for forwards copy s was offset by -16 from the original input |
| // value of s so the register contents are at these offsets |
| // relative to the 64 bit block addressed by that original input |
| // and so on for each successive 64 byte block when s is updated |
| // |
| // t0 at offset 0, t1 at offset 8 |
| // t2 at offset 16, t3 at offset 24 |
| // t4 at offset 32, t5 at offset 40 |
| // t6 at offset 48, t7 at offset 56 |
| |
| // for backwards copy s was not offset so the register contents |
| // are at these offsets into the preceding 64 byte block |
| // relative to that original input and so on for each successive |
| // preceding 64 byte block when s is updated. this explains the |
| // slightly counter-intuitive looking pattern of register usage |
| // in the stp instructions for backwards copy. |
| // |
| // t0 at offset -16, t1 at offset -8 |
| // t2 at offset -32, t3 at offset -24 |
| // t4 at offset -48, t5 at offset -40 |
| // t6 at offset -64, t7 at offset -56 |
| |
| bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); |
| bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); |
| bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); |
| |
| __ subs(count, count, 16); |
| __ br(Assembler::LO, drain); |
| |
| int prefetch = PrefetchCopyIntervalInBytes; |
| bool use_stride = false; |
| if (direction == copy_backwards) { |
| use_stride = prefetch > 256; |
| prefetch = -prefetch; |
| if (use_stride) __ mov(stride, prefetch); |
| } |
| |
| __ bind(again); |
| |
| if (PrefetchCopyIntervalInBytes > 0) |
| __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); |
| |
| if (direction == copy_forwards) { |
| // allowing for the offset of -8 the store instructions place |
| // registers into the target 64 bit block at the following |
| // offsets |
| // |
| // t0 at offset 0 |
| // t1 at offset 8, t2 at offset 16 |
| // t3 at offset 24, t4 at offset 32 |
| // t5 at offset 40, t6 at offset 48 |
| // t7 at offset 56 |
| |
| bs.copy_store_at_8(Address(d, 1 * unit), t0); |
| bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); |
| bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); |
| bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); |
| bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); |
| bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); |
| bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); |
| bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); |
| } else { |
| // d was not offset when we started so the registers are |
| // written into the 64 bit block preceding d with the following |
| // offsets |
| // |
| // t1 at offset -8 |
| // t3 at offset -24, t0 at offset -16 |
| // t5 at offset -40, t2 at offset -32 |
| // t7 at offset -56, t4 at offset -48 |
| // t6 at offset -64 |
| // |
| // note that this matches the offsets previously noted for the |
| // loads |
| |
| bs.copy_store_at_8(Address(d, 1 * unit), t1); |
| bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); |
| bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); |
| bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); |
| bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); |
| bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); |
| bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); |
| bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); |
| } |
| |
| __ subs(count, count, 8); |
| __ br(Assembler::HS, again); |
| |
| // Drain |
| // |
| // this uses the same pattern of offsets and register arguments |
| // as above |
| __ bind(drain); |
| if (direction == copy_forwards) { |
| bs.copy_store_at_8(Address(d, 1 * unit), t0); |
| bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); |
| bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); |
| bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); |
| bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); |
| } else { |
| bs.copy_store_at_8(Address(d, 1 * unit), t1); |
| bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); |
| bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); |
| bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); |
| bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); |
| } |
| // now we need to copy any remaining part block which may |
| // include a 4 word subblock and/or a 2 word subblock. |
| // bits 2 and 1 in the count are the tell-tale for whether we |
| // have each such subblock |
| { |
| Label L1, L2; |
| __ tbz(count, exact_log2(4), L1); |
| // this is the same as above but copying only 4 longs hence |
| // with only one intervening stp between the str instructions |
| // but note that the offsets and registers still follow the |
| // same pattern |
| bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); |
| bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); |
| if (direction == copy_forwards) { |
| bs.copy_store_at_8(Address(d, 1 * unit), t0); |
| bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); |
| bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); |
| } else { |
| bs.copy_store_at_8(Address(d, 1 * unit), t1); |
| bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); |
| bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); |
| } |
| __ bind(L1); |
| |
| __ tbz(count, 1, L2); |
| // this is the same as above but copying only 2 longs hence |
| // there is no intervening stp between the str instructions |
| // but note that the offset and register patterns are still |
| // the same |
| bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); |
| if (direction == copy_forwards) { |
| bs.copy_store_at_8(Address(d, 1 * unit), t0); |
| bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); |
| } else { |
| bs.copy_store_at_8(Address(d, 1 * unit), t1); |
| bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); |
| } |
| __ bind(L2); |
| |
| // for forwards copy we need to re-adjust the offsets we |
| // applied so that s and d follow the last words written |
| |
| if (direction == copy_forwards) { |
| __ add(s, s, 16); |
| __ add(d, d, 8); |
| } |
| |
| } |
| |
| __ ret(lr); |
| } |
| } |
| |
| // Small copy: less than 16 bytes. |
| // |
| // NB: Ignores all of the bits of count which represent more than 15 |
| // bytes, so a caller doesn't have to mask them. |
| |
| void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { |
| bool is_backwards = step < 0; |
| size_t granularity = uabs(step); |
| int direction = is_backwards ? -1 : 1; |
| |
| Label Lword, Lint, Lshort, Lbyte; |
| |
| assert(granularity |
| && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); |
| |
| const Register t0 = r3; |
| const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; |
| ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); |
| |
| // ??? I don't know if this bit-test-and-branch is the right thing |
| // to do. It does a lot of jumping, resulting in several |
| // mispredicted branches. It might make more sense to do this |
| // with something like Duff's device with a single computed branch. |
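| |
| // worked example (illustrative): for a short copy (granularity == 2) the |
| // tests below check count bit 2 (a 4-short / one-word chunk), bit 1 (a |
| // 2-short / one-int chunk) and bit 0 (a single short), copying at most |
| // one chunk of each size. |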
| |
| __ tbz(count, 3 - exact_log2(granularity), Lword); |
| bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); |
| bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); |
| __ bind(Lword); |
| |
| if (granularity <= sizeof (jint)) { |
| __ tbz(count, 2 - exact_log2(granularity), Lint); |
| __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); |
| __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); |
| __ bind(Lint); |
| } |
| |
| if (granularity <= sizeof (jshort)) { |
| __ tbz(count, 1 - exact_log2(granularity), Lshort); |
| __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); |
| __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); |
| __ bind(Lshort); |
| } |
| |
| if (granularity <= sizeof (jbyte)) { |
| __ tbz(count, 0, Lbyte); |
| __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); |
| __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); |
| __ bind(Lbyte); |
| } |
| } |
| |
| Label copy_f, copy_b; |
| Label copy_obj_f, copy_obj_b; |
| Label copy_obj_uninit_f, copy_obj_uninit_b; |
| |
| // All-singing all-dancing memory copy. |
| // |
| // Copy count units of memory from s to d. The size of a unit is |
| // step, which can be positive or negative depending on the direction |
| // of copy. If is_aligned is false, we align the source address. |
| // |
| |
| void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, |
| Register s, Register d, Register count, int step) { |
| copy_direction direction = step < 0 ? copy_backwards : copy_forwards; |
| bool is_backwards = step < 0; |
| unsigned int granularity = uabs(step); |
| const Register t0 = r3, t1 = r4; |
| |
| // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always |
| // load all the data before writing anything |
| Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; |
| const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; |
| const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; |
| const Register send = r17, dend = r16; |
| const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; |
| const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved |
| ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); |
| |
| if (PrefetchCopyIntervalInBytes > 0) |
| __ prfm(Address(s, 0), PLDL1KEEP); |
| __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity)); |
| __ br(Assembler::HI, copy_big); |
| |
| __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); |
| __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); |
| |
| __ cmp(count, u1(16/granularity)); |
| __ br(Assembler::LS, copy16); |
| |
| __ cmp(count, u1(64/granularity)); |
| __ br(Assembler::HI, copy80); |
| |
| __ cmp(count, u1(32/granularity)); |
| __ br(Assembler::LS, copy32); |
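| |
| // dispatch summary (illustrative), in bytes copied: |
| // 0..16 -> copy16 |
| // 17..32 -> copy32 |
| // 33..64 -> fall through below |
| // 65..80 (96 for SIMD) -> copy80 |
| // larger -> copy_big (branched to above) |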
| |
| // 33..64 bytes |
| if (UseSIMDForMemoryOps) { |
| bs.copy_load_at_32(v0, v1, Address(s, 0)); |
| bs.copy_load_at_32(v2, v3, Address(send, -32)); |
| bs.copy_store_at_32(Address(d, 0), v0, v1); |
| bs.copy_store_at_32(Address(dend, -32), v2, v3); |
| } else { |
| bs.copy_load_at_16(t0, t1, Address(s, 0)); |
| bs.copy_load_at_16(t2, t3, Address(s, 16)); |
| bs.copy_load_at_16(t4, t5, Address(send, -32)); |
| bs.copy_load_at_16(t6, t7, Address(send, -16)); |
| |
| bs.copy_store_at_16(Address(d, 0), t0, t1); |
| bs.copy_store_at_16(Address(d, 16), t2, t3); |
| bs.copy_store_at_16(Address(dend, -32), t4, t5); |
| bs.copy_store_at_16(Address(dend, -16), t6, t7); |
| } |
| __ b(finish); |
| |
| // 17..32 bytes |
| __ bind(copy32); |
| bs.copy_load_at_16(t0, t1, Address(s, 0)); |
| bs.copy_load_at_16(t6, t7, Address(send, -16)); |
| |
| bs.copy_store_at_16(Address(d, 0), t0, t1); |
| bs.copy_store_at_16(Address(dend, -16), t6, t7); |
| __ b(finish); |
| |
| // 65..80/96 bytes |
| // (96 bytes if SIMD because we do 32 bytes per instruction) |
| __ bind(copy80); |
| if (UseSIMDForMemoryOps) { |
| bs.copy_load_at_32(v0, v1, Address(s, 0)); |
| bs.copy_load_at_32(v2, v3, Address(s, 32)); |
| // Unaligned pointers can be an issue for copying. |
| // The issue is more likely when the granularity of the data is |
| // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least |
| // 4-byte aligned. Pointers for arrays of jlong are 8-byte aligned. |
| // The biggest performance drop has been seen for the range 65-80 bytes. |
| // For such cases, using a pair of ldp/stp instead of the third pair of |
| // ldpq/stpq fixes the performance issue. |
| if (granularity < sizeof (jint)) { |
| Label copy96; |
| __ cmp(count, u1(80/granularity)); |
| __ br(Assembler::HI, copy96); |
| bs.copy_load_at_16(t0, t1, Address(send, -16)); |
| |
| bs.copy_store_at_32(Address(d, 0), v0, v1); |
| bs.copy_store_at_32(Address(d, 32), v2, v3); |
| |
| bs.copy_store_at_16(Address(dend, -16), t0, t1); |
| __ b(finish); |
| |
| __ bind(copy96); |
| } |
| bs.copy_load_at_32(v4, v5, Address(send, -32)); |
| |
| bs.copy_store_at_32(Address(d, 0), v0, v1); |
| bs.copy_store_at_32(Address(d, 32), v2, v3); |
| |
| bs.copy_store_at_32(Address(dend, -32), v4, v5); |
| } else { |
| bs.copy_load_at_16(t0, t1, Address(s, 0)); |
| bs.copy_load_at_16(t2, t3, Address(s, 16)); |
| bs.copy_load_at_16(t4, t5, Address(s, 32)); |
| bs.copy_load_at_16(t6, t7, Address(s, 48)); |
| bs.copy_load_at_16(t8, t9, Address(send, -16)); |
| |
| bs.copy_store_at_16(Address(d, 0), t0, t1); |
| bs.copy_store_at_16(Address(d, 16), t2, t3); |
| bs.copy_store_at_16(Address(d, 32), t4, t5); |
| bs.copy_store_at_16(Address(d, 48), t6, t7); |
| bs.copy_store_at_16(Address(dend, -16), t8, t9); |
| } |
| __ b(finish); |
| |
| // 0..16 bytes |
| __ bind(copy16); |
| __ cmp(count, u1(8/granularity)); |
| __ br(Assembler::LO, copy8); |
| |
| // 8..16 bytes |
| bs.copy_load_at_8(t0, Address(s, 0)); |
| bs.copy_load_at_8(t1, Address(send, -8)); |
| bs.copy_store_at_8(Address(d, 0), t0); |
| bs.copy_store_at_8(Address(dend, -8), t1); |
| __ b(finish); |
| |
| if (granularity < 8) { |
| // 4..7 bytes |
| __ bind(copy8); |
| __ tbz(count, 2 - exact_log2(granularity), copy4); |
| __ ldrw(t0, Address(s, 0)); |
| __ ldrw(t1, Address(send, -4)); |
| __ strw(t0, Address(d, 0)); |
| __ strw(t1, Address(dend, -4)); |
| __ b(finish); |
| if (granularity < 4) { |
| // 0..3 bytes |
| __ bind(copy4); |
| __ cbz(count, finish); // get rid of 0 case |
| if (granularity == 2) { |
| __ ldrh(t0, Address(s, 0)); |
| __ strh(t0, Address(d, 0)); |
| } else { // granularity == 1 |
| // Now 1..3 bytes. Handle the 1 and 2 byte case by copying |
| // the first and last byte. |
| // Handle the 3 byte case by loading and storing base + count/2 |
| // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) |
| // This does mean in the 1 byte case we load/store the same |
| // byte 3 times. |
| __ lsr(count, count, 1); |
| __ ldrb(t0, Address(s, 0)); |
| __ ldrb(t1, Address(send, -1)); |
| __ ldrb(t2, Address(s, count)); |
| __ strb(t0, Address(d, 0)); |
| __ strb(t1, Address(dend, -1)); |
| __ strb(t2, Address(d, count)); |
| } |
| __ b(finish); |
| } |
| } |
| |
| __ bind(copy_big); |
| if (is_backwards) { |
| __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); |
| __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); |
| } |
| |
| // Now we've got the small case out of the way we can align the |
| // source address on a 2-word boundary. |
| |
| // Here we will materialize a count in r15, which is used by copy_memory_small |
| // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. |
| // Up until here, we have used t9, which aliases r15, but from here on, that register |
| // cannot be used as a temp register, as it contains the count. |
| |
| Label aligned; |
| |
| if (is_aligned) { |
| // We may have to adjust by 1 word to get s 2-word-aligned. |
| __ tbz(s, exact_log2(wordSize), aligned); |
| bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); |
| bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); |
| __ sub(count, count, wordSize/granularity); |
| } else { |
| if (is_backwards) { |
| __ andr(r15, s, 2 * wordSize - 1); |
| } else { |
| __ neg(r15, s); |
| __ andr(r15, r15, 2 * wordSize - 1); |
| } |
| // r15 is the byte adjustment needed to align s. |
| __ cbz(r15, aligned); |
| int shift = exact_log2(granularity); |
| if (shift) __ lsr(r15, r15, shift); |
| __ sub(count, count, r15); |
| |
| #if 0 |
| // ?? This code is only correct for a disjoint copy. It may or |
| // may not make sense to use it in that case. |
| |
| // Copy the first pair; s and d may not be aligned. |
| __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); |
| __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); |
| |
| // Align s and d, adjust count |
| if (is_backwards) { |
| __ sub(s, s, r15); |
| __ sub(d, d, r15); |
| } else { |
| __ add(s, s, r15); |
| __ add(d, d, r15); |
| } |
| #else |
| copy_memory_small(decorators, type, s, d, r15, step); |
| #endif |
| } |
| |
| __ bind(aligned); |
| |
| // s is now 2-word-aligned. |
| |
| // We have a count of units and some trailing bytes. Adjust the |
| // count and do a bulk copy of words. |
| __ lsr(r15, count, exact_log2(wordSize/granularity)); |
| if (direction == copy_forwards) { |
| if (type != T_OBJECT) { |
| __ bl(copy_f); |
| } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { |
| __ bl(copy_obj_uninit_f); |
| } else { |
| __ bl(copy_obj_f); |
| } |
| } else { |
| if (type != T_OBJECT) { |
| __ bl(copy_b); |
| } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { |
| __ bl(copy_obj_uninit_b); |
| } else { |
| __ bl(copy_obj_b); |
| } |
| } |
| |
| // And the tail. |
| copy_memory_small(decorators, type, s, d, count, step); |
| |
| if (granularity >= 8) __ bind(copy8); |
| if (granularity >= 4) __ bind(copy4); |
| __ bind(finish); |
| } |
| |
| |
| void clobber_registers() { |
| #ifdef ASSERT |
| RegSet clobbered |
| = MacroAssembler::call_clobbered_gp_registers() - rscratch1; |
| __ mov(rscratch1, (uint64_t)0xdeadbeef); |
| __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); |
| for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { |
| __ mov(*it, rscratch1); |
| } |
| #endif |
| |
| } |
| |
| // Scan over array at a for count oops, verifying each one. |
| // Preserves a and count, clobbers rscratch1 and rscratch2. |
| void verify_oop_array (int size, Register a, Register count, Register temp) { |
| Label loop, end; |
| __ mov(rscratch1, a); |
| __ mov(rscratch2, zr); |
| __ bind(loop); |
| __ cmp(rscratch2, count); |
| __ br(Assembler::HS, end); |
| if (size == wordSize) { |
| __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); |
| __ verify_oop(temp); |
| } else { |
| __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); |
| __ decode_heap_oop(temp); // calls verify_oop |
| } |
| __ add(rscratch2, rscratch2, 1); |
| __ b(loop); |
| __ bind(end); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // is_oop - true => oop array, so generate store check code |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let |
| // the hardware handle it. The two dwords within qwords that span |
| // cache line boundaries will still be loaded and stored atomically. |
| // |
| // Side Effects: |
| // disjoint_int_copy_entry is set to the no-overlap entry point |
| // used by generate_conjoint_int_oop_copy(). |
| // |
| address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, |
| const char *name, bool dest_uninitialized = false) { |
| Register s = c_rarg0, d = c_rarg1, count = c_rarg2; |
| RegSet saved_reg = RegSet::of(s, d, count); |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| __ enter(); |
| |
| if (entry != nullptr) { |
| *entry = __ pc(); |
| // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
| BLOCK_COMMENT("Entry:"); |
| } |
| |
| DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; |
| if (dest_uninitialized) { |
| decorators |= IS_DEST_UNINITIALIZED; |
| } |
| if (aligned) { |
| decorators |= ARRAYCOPY_ALIGNED; |
| } |
| |
| BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
| bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); |
| |
| if (is_oop) { |
| // save regs before copy_memory |
| __ push(RegSet::of(d, count), sp); |
| } |
| { |
| // UnsafeCopyMemory page error: continue after ucm |
| bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); |
| UnsafeCopyMemoryMark ucmm(this, add_entry, true); |
| copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); |
| } |
| |
| if (is_oop) { |
| __ pop(RegSet::of(d, count), sp); |
| if (VerifyOops) |
| verify_oop_array(size, d, count, r16); |
| } |
| |
| bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); |
| |
| __ leave(); |
| __ mov(r0, zr); // return 0 |
| __ ret(lr); |
| return start; |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // is_oop - true => oop array, so generate store check code |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let |
| // the hardware handle it. The two dwords within qwords that span |
| // cache line boundaries will still be loaded and stored atomically. |
| // |
| address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, |
| address *entry, const char *name, |
| bool dest_uninitialized = false) { |
| Register s = c_rarg0, d = c_rarg1, count = c_rarg2; |
| RegSet saved_regs = RegSet::of(s, d, count); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| __ enter(); |
| |
| if (entry != nullptr) { |
| *entry = __ pc(); |
| // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) |
| BLOCK_COMMENT("Entry:"); |
| } |
| |
| // use fwd copy when (d-s) above_equal (count*size) |
| __ sub(rscratch1, d, s); |
| __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); |
| __ br(Assembler::HS, nooverlap_target); |
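| // n.b. (illustrative) the comparison is unsigned, so when d < s the |
| // subtraction wraps to a large value and we also take the forward-copy |
| // path -- copying forwards is always safe when the destination starts |
| // below the source. |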
| |
| DecoratorSet decorators = IN_HEAP | IS_ARRAY; |
| if (dest_uninitialized) { |
| decorators |= IS_DEST_UNINITIALIZED; |
| } |
| if (aligned) { |
| decorators |= ARRAYCOPY_ALIGNED; |
| } |
| |
| BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
| bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); |
| |
| if (is_oop) { |
| // save regs before copy_memory |
| __ push(RegSet::of(d, count), sp); |
| } |
| { |
| // UnsafeCopyMemory page error: continue after ucm |
| bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); |
| UnsafeCopyMemoryMark ucmm(this, add_entry, true); |
| copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); |
| } |
| if (is_oop) { |
| __ pop(RegSet::of(d, count), sp); |
| if (VerifyOops) |
| verify_oop_array(size, d, count, r16); |
| } |
| bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); |
| __ leave(); |
| __ mov(r0, zr); // return 0 |
| __ ret(lr); |
| return start; |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, |
| // we let the hardware handle it. The one to eight bytes within words, |
| // dwords or qwords that span cache line boundaries will still be loaded |
| // and stored atomically. |
| // |
| // Side Effects: |
| // disjoint_byte_copy_entry is set to the no-overlap entry point |
| // used by generate_conjoint_byte_copy(). |
| // |
| address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { |
| const bool not_oop = false; |
| return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, |
| // we let the hardware handle it. The one to eight bytes within words, |
| // dwords or qwords that span cache line boundaries will still be loaded |
| // and stored atomically. |
| // |
| address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, |
| address* entry, const char *name) { |
| const bool not_oop = false; |
| return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we |
| // let the hardware handle it. The two or four words within dwords |
| // or qwords that span cache line boundaries will still be loaded |
| // and stored atomically. |
| // |
| // Side Effects: |
| // disjoint_short_copy_entry is set to the no-overlap entry point |
| // used by generate_conjoint_short_copy(). |
| // |
| address generate_disjoint_short_copy(bool aligned, |
| address* entry, const char *name) { |
| const bool not_oop = false; |
| return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we |
| // let the hardware handle it. The two or four words within dwords |
| // or qwords that span cache line boundaries will still be loaded |
| // and stored atomically. |
| // |
| address generate_conjoint_short_copy(bool aligned, address nooverlap_target, |
| address *entry, const char *name) { |
| const bool not_oop = false; |
| return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let |
| // the hardware handle it. The two dwords within qwords that span |
| // cache line boundaries will still be loaded and stored atomically. |
| // |
| // Side Effects: |
| // disjoint_int_copy_entry is set to the no-overlap entry point |
| // used by generate_conjoint_int_copy(). |
| // |
| address generate_disjoint_int_copy(bool aligned, address *entry, |
| const char *name, bool dest_uninitialized = false) { |
| const bool not_oop = false; |
| return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // |
| // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let |
| // the hardware handle it. The two dwords within qwords that span |
| // cache line boundaries will still be loaded and stored atomically. |
| // |
| address generate_conjoint_int_copy(bool aligned, address nooverlap_target, |
| address *entry, const char *name, |
| bool dest_uninitialized = false) { |
| const bool not_oop = false; |
| return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); |
| } |
| |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as size_t, can be zero |
| // |
| // Side Effects: |
| // disjoint_long_copy_entry is set to the no-overlap entry point |
| // used by generate_conjoint_long_copy(). |
| // |
| address generate_disjoint_long_copy(bool aligned, address *entry, |
| const char *name, bool dest_uninitialized = false) { |
| const bool not_oop = false; |
| return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as size_t, can be zero |
| // |
| address generate_conjoint_long_copy(bool aligned, |
| address nooverlap_target, address *entry, |
| const char *name, bool dest_uninitialized = false) { |
| const bool not_oop = false; |
| return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as size_t, can be zero |
| // |
| // Side Effects: |
| // disjoint_oop_copy_entry is set to the no-overlap entry point |
| // used by generate_conjoint_oop_copy(). |
| // |
| address generate_disjoint_oop_copy(bool aligned, address *entry, |
| const char *name, bool dest_uninitialized) { |
| const bool is_oop = true; |
| const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); |
| return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); |
| } |
| |
| // Arguments: |
| // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes |
| // ignored |
| // name - stub name string |
| // |
| // Inputs: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as size_t, can be zero |
| // |
| address generate_conjoint_oop_copy(bool aligned, |
| address nooverlap_target, address *entry, |
| const char *name, bool dest_uninitialized) { |
| const bool is_oop = true; |
| const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); |
| return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, |
| name, dest_uninitialized); |
| } |
| |
| |
| // Helper for generating a dynamic type check. |
| // Smashes rscratch1, rscratch2. |
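| // In pseudo-code the emitted check is roughly (a sketch, not the exact |
| // sequence produced by check_klass_subtype_fast_path/slow_path): |
| // |
| //   if (sub_klass == super_klass) goto L_success;          // trivial hit |
| //   if (*(sub_klass + super_check_offset) == super_klass) |
| //     goto L_success;                                      // cached hit |
| //   if (secondary_supers(sub_klass) contains super_klass) |
| //     goto L_success;                                      // slow path |
| //   // otherwise fall through to L_miss |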
| void generate_type_check(Register sub_klass, |
| Register super_check_offset, |
| Register super_klass, |
| Label& L_success) { |
| assert_different_registers(sub_klass, super_check_offset, super_klass); |
| |
| BLOCK_COMMENT("type_check:"); |
| |
| Label L_miss; |
| |
| __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, |
| super_check_offset); |
| __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); |
| |
| // Fall through on failure! |
| __ BIND(L_miss); |
| } |
| |
| // |
| // Generate checkcasting array copy stub |
| // |
| // Input: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - element count, treated as ssize_t, can be zero |
| // c_rarg3 - size_t ckoff (super_check_offset) |
| // c_rarg4 - oop ckval (super_klass) |
| // |
| // Output: |
| // r0 == 0 - success |
| // r0 == -1^K - failure, where K is partial transfer count |
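| //   For example (a worked sketch): if the type check fails on |
| //   element K == 3, the stub returns -1 ^ 3 == ~3 == -4, and the |
| //   caller recovers K as ~r0. |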
| // |
| address generate_checkcast_copy(const char *name, address *entry, |
| bool dest_uninitialized = false) { |
| |
| Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; |
| |
| // Input registers (after setup_arg_regs) |
| const Register from = c_rarg0; // source array address |
| const Register to = c_rarg1; // destination array address |
| const Register count = c_rarg2; // elements count |
| const Register ckoff = c_rarg3; // super_check_offset |
| const Register ckval = c_rarg4; // super_klass |
| |
| RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); |
| RegSet wb_post_saved_regs = RegSet::of(count); |
| |
| // Registers used as temps (r19, r20, r21, r22 are save-on-entry) |
| const Register copied_oop = r22; // actual oop copied |
| const Register count_save = r21; // orig elements count |
| const Register start_to = r20; // destination array start address |
| const Register r19_klass = r19; // oop._klass |
| |
| // Registers used as gc temps (r5, r6, r7 are save-on-call) |
| const Register gct1 = r5, gct2 = r6, gct3 = r7; |
| |
| //--------------------------------------------------------------- |
| // Assembler stub will be used for this call to arraycopy |
| // if the two arrays are subtypes of Object[] but the |
| // destination array type is not equal to or a supertype |
| // of the source type. Each element must be separately |
| // checked. |
| |
| assert_different_registers(from, to, count, ckoff, ckval, start_to, |
| copied_oop, r19_klass, count_save); |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| |
| __ enter(); // required for proper stackwalking of RuntimeStub frame |
| |
| #ifdef ASSERT |
| // caller guarantees that the arrays really are different |
| // otherwise, we would have to make conjoint checks |
| { Label L; |
| __ b(L); // conjoint check not yet implemented |
| __ stop("checkcast_copy within a single array"); |
| __ bind(L); |
| } |
| #endif //ASSERT |
| |
| // Caller of this entry point must set up the argument registers. |
| if (entry != nullptr) { |
| *entry = __ pc(); |
| BLOCK_COMMENT("Entry:"); |
| } |
| |
| // Empty array: Nothing to do. |
| __ cbz(count, L_done); |
| __ push(RegSet::of(r19, r20, r21, r22), sp); |
| |
| #ifdef ASSERT |
| BLOCK_COMMENT("assert consistent ckoff/ckval"); |
| // The ckoff and ckval must be mutually consistent, |
| // even though caller generates both. |
| { Label L; |
| int sco_offset = in_bytes(Klass::super_check_offset_offset()); |
| __ ldrw(start_to, Address(ckval, sco_offset)); |
| __ cmpw(ckoff, start_to); |
| __ br(Assembler::EQ, L); |
| __ stop("super_check_offset inconsistent"); |
| __ bind(L); |
| } |
| #endif //ASSERT |
| |
| DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; |
| bool is_oop = true; |
| int element_size = UseCompressedOops ? 4 : 8; |
| if (dest_uninitialized) { |
| decorators |= IS_DEST_UNINITIALIZED; |
| } |
| |
| BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); |
| bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); |
| |
| // save the original count |
| __ mov(count_save, count); |
| |
| // Copy from low to high addresses |
| __ mov(start_to, to); // Save destination array start address |
| __ b(L_load_element); |
| |
| // ======== begin loop ======== |
| // (Loop is rotated; its entry is L_load_element.) |
| // Loop control: |
| // for (; count != 0; count--) { |
| // copied_oop = load_heap_oop(from++); |
| // ... generate_type_check ...; |
| // store_heap_oop(to++, copied_oop); |
| // } |
| __ align(OptoLoopAlignment); |
| |
| __ BIND(L_store_element); |
| bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, |
| __ post(to, element_size), copied_oop, noreg, |
| gct1, gct2, gct3); |
| __ sub(count, count, 1); |
| __ cbz(count, L_do_card_marks); |
| |
| // ======== loop entry is here ======== |
| __ BIND(L_load_element); |
| bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, |
| copied_oop, noreg, __ post(from, element_size), |
| gct1); |
| __ cbz(copied_oop, L_store_element); |
| |
| __ load_klass(r19_klass, copied_oop); // query the object klass |
| generate_type_check(r19_klass, ckoff, ckval, L_store_element); |
| // ======== end loop ======== |
| |
| // It was a real error; we must depend on the caller to finish the job. |
| // Register count = remaining oops, count_orig = total oops. |
| // Emit GC store barriers for the oops we have copied and report |
| // their number to the caller. |
| |
| __ subs(count, count_save, count); // K = partially copied oop count |
| __ eon(count, count, zr); // report (-1^K) to caller |
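| // (eon with zr is bitwise NOT, so count now holds ~K == -1 ^ K) |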
| __ br(Assembler::EQ, L_done_pop); |
| |
| __ BIND(L_do_card_marks); |
| bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); |
| |
| __ bind(L_done_pop); |
| __ pop(RegSet::of(r19, r20, r21, r22), sp); |
| inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); |
| |
| __ bind(L_done); |
| __ mov(r0, count); |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Perform range checks on the proposed arraycopy. |
| // Kills temp, but nothing else. |
| // Also, clean the sign bits of src_pos and dst_pos. |
| void arraycopy_range_checks(Register src, // source array oop (c_rarg0) |
| Register src_pos, // source position (c_rarg1) |
| Register dst, // destination array oop (c_rarg2) |
| Register dst_pos, // destination position (c_rarg3) |
| Register length, |
| Register temp, |
| Label& L_failed) { |
| BLOCK_COMMENT("arraycopy_range_checks:"); |
| |
| assert_different_registers(rscratch1, temp); |
| |
| // if (src_pos + length > arrayOop(src)->length()) FAIL; |
| __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); |
| __ addw(temp, length, src_pos); |
| __ cmpw(temp, rscratch1); |
| __ br(Assembler::HI, L_failed); |
| |
| // if (dst_pos + length > arrayOop(dst)->length()) FAIL; |
| __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); |
| __ addw(temp, length, dst_pos); |
| __ cmpw(temp, rscratch1); |
| __ br(Assembler::HI, L_failed); |
| |
| // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. |
| __ movw(src_pos, src_pos); |
| __ movw(dst_pos, dst_pos); |
| |
| BLOCK_COMMENT("arraycopy_range_checks done"); |
| } |
| |
| // These stubs get called from some dumb test routine. |
| // I'll write them properly when they're called from |
| // something that's actually doing something. |
| static void fake_arraycopy_stub(address src, address dst, int count) { |
| assert(count == 0, "huh?"); |
| } |
| |
| |
| // |
| // Generate 'unsafe' array copy stub |
| // Though just as safe as the other stubs, it takes an unscaled |
| // size_t argument instead of an element count. |
| // |
| // Input: |
| // c_rarg0 - source array address |
| // c_rarg1 - destination array address |
| // c_rarg2 - byte count, treated as ssize_t, can be zero |
| // |
| // Examines the alignment of the operands and dispatches |
| // to a long, int, short, or byte copy loop. |
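| // Roughly (a sketch of the dispatch below), with x = src | dst | count: |
| // |
| //   if ((x & 7) == 0) goto long_copy; |
| //   else if ((x & 3) == 0) goto int_copy; |
| //   else if ((x & 1) == 0) goto short_copy; |
| //   else goto byte_copy; |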
| // |
| address generate_unsafe_copy(const char *name, |
| address byte_copy_entry, |
| address short_copy_entry, |
| address int_copy_entry, |
| address long_copy_entry) { |
| Label L_long_aligned, L_int_aligned, L_short_aligned; |
| Register s = c_rarg0, d = c_rarg1, count = c_rarg2; |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| __ enter(); // required for proper stackwalking of RuntimeStub frame |
| |
| // bump this on entry, not on exit: |
| inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); |
| |
| __ orr(rscratch1, s, d); |
| __ orr(rscratch1, rscratch1, count); |
| |
| __ andr(rscratch1, rscratch1, BytesPerLong-1); |
| __ cbz(rscratch1, L_long_aligned); |
| __ andr(rscratch1, rscratch1, BytesPerInt-1); |
| __ cbz(rscratch1, L_int_aligned); |
| __ tbz(rscratch1, 0, L_short_aligned); |
| __ b(RuntimeAddress(byte_copy_entry)); |
| |
| __ BIND(L_short_aligned); |
| __ lsr(count, count, LogBytesPerShort); // size => short_count |
| __ b(RuntimeAddress(short_copy_entry)); |
| __ BIND(L_int_aligned); |
| __ lsr(count, count, LogBytesPerInt); // size => int_count |
| __ b(RuntimeAddress(int_copy_entry)); |
| __ BIND(L_long_aligned); |
| __ lsr(count, count, LogBytesPerLong); // size => long_count |
| __ b(RuntimeAddress(long_copy_entry)); |
| |
| return start; |
| } |
| |
| // |
| // Generate generic array copy stubs |
| // |
| // Input: |
| // c_rarg0 - src oop |
| // c_rarg1 - src_pos (32-bits) |
| // c_rarg2 - dst oop |
| // c_rarg3 - dst_pos (32-bits) |
| // c_rarg4 - element count (32-bits) |
| // |
| // Output: |
| // r0 == 0 - success |
| // r0 == -1^K - failure, where K is partial transfer count |
| // |
| address generate_generic_copy(const char *name, |
| address byte_copy_entry, address short_copy_entry, |
| address int_copy_entry, address oop_copy_entry, |
| address long_copy_entry, address checkcast_copy_entry) { |
| |
| Label L_failed, L_objArray; |
| Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; |
| |
| // Input registers |
| const Register src = c_rarg0; // source array oop |
| const Register src_pos = c_rarg1; // source position |
| const Register dst = c_rarg2; // destination array oop |
| const Register dst_pos = c_rarg3; // destination position |
| const Register length = c_rarg4; |
| |
| |
| // Registers used as temps |
| const Register dst_klass = c_rarg5; |
| |
| __ align(CodeEntryAlignment); |
| |
| StubCodeMark mark(this, "StubRoutines", name); |
| |
| address start = __ pc(); |
| |
| __ enter(); // required for proper stackwalking of RuntimeStub frame |
| |
| // bump this on entry, not on exit: |
| inc_counter_np(SharedRuntime::_generic_array_copy_ctr); |
| |
| //----------------------------------------------------------------------- |
| // Assembler stub will be used for this call to arraycopy |
| // if the following conditions are met: |
| // |
| // (1) src and dst must not be null. |
| // (2) src_pos must not be negative. |
| // (3) dst_pos must not be negative. |
| // (4) length must not be negative. |
| // (5) src klass and dst klass should be the same and not null. |
| // (6) src and dst should be arrays. |
| // (7) src_pos + length must not exceed length of src. |
| // (8) dst_pos + length must not exceed length of dst. |
| // |
| |
| // if (src == nullptr) return -1; |
| __ cbz(src, L_failed); |
| |
| // if (src_pos < 0) return -1; |
| __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set |
| |
| // if (dst == nullptr) return -1; |
| __ cbz(dst, L_failed); |
| |
| // if (dst_pos < 0) return -1; |
| __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set |
| |
| // registers used as temp |
| const Register scratch_length = r16; // elements count to copy |
| const Register scratch_src_klass = r17; // array klass |
| const Register lh = r15; // layout helper |
| |
| // if (length < 0) return -1; |
| __ movw(scratch_length, length); // length (elements count, 32-bits value) |
| __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set |
| |
| __ load_klass(scratch_src_klass, src); |
| #ifdef ASSERT |
| // assert(src->klass() != nullptr); |
| { |
| BLOCK_COMMENT("assert klasses not null {"); |
| Label L1, L2; |
| __ cbnz(scratch_src_klass, L2); // it is broken if klass is null |
| __ bind(L1); |
| __ stop("broken null klass"); |
| __ bind(L2); |
| __ load_klass(rscratch1, dst); |
| __ cbz(rscratch1, L1); // this would be broken also |
| BLOCK_COMMENT("} assert klasses not null done"); |
| } |
| #endif |
| |
| // Load layout helper (32-bits) |
| // |
| //  |array_tag|     | header_size | element_type |     |log2_element_size| |
| // 32        30    24            16              8     2                 0 |
| // |
| // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 |
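| // |
| // For example (an illustrative decode, per the diagram above): a jint[] |
| // has array_tag 0x3 (typeArray), element_type T_INT and log2_element_size 2. |
| // Because the tag occupies the sign bits, every array layout helper is |
| // negative, which the is_Array test below relies on. |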
| // |
| |
| const int lh_offset = in_bytes(Klass::layout_helper_offset()); |
| |
| // Handle objArrays completely differently... |
| const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); |
| __ ldrw(lh, Address(scratch_src_klass, lh_offset)); |
| __ movw(rscratch1, objArray_lh); |
| __ eorw(rscratch2, lh, rscratch1); |
| __ cbzw(rscratch2, L_objArray); |
| |
| // if (src->klass() != dst->klass()) return -1; |
| __ load_klass(rscratch2, dst); |
| __ eor(rscratch2, rscratch2, scratch_src_klass); |
| __ cbnz(rscratch2, L_failed); |
| |
| // if (!src->is_Array()) return -1; |
| __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) |
| |
| // At this point, it is known to be a typeArray (array_tag 0x3). |
| #ifdef ASSERT |
| { |
| BLOCK_COMMENT("assert primitive array {"); |
| Label L; |
| __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); |
| __ cmpw(lh, rscratch2); |
| __ br(Assembler::GE, L); |
| __ stop("must be a primitive array"); |
| __ bind(L); |
| BLOCK_COMMENT("} assert primitive array done"); |
| } |
| #endif |
| |
| arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, |
| rscratch2, L_failed); |
| |
| // TypeArrayKlass |
| // |
| // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); |
| // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); |
| // |
| |
| const Register rscratch1_offset = rscratch1; // array offset |
| const Register r15_elsize = lh; // element size |
| |
| __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, |
| exact_log2(Klass::_lh_header_size_mask+1)); // array_offset |
| __ add(src, src, rscratch1_offset); // src array offset |
| __ add(dst, dst, rscratch1_offset); // dst array offset |
| BLOCK_COMMENT("choose copy loop based on element size"); |
| |
| // The following registers must be set before the jump to the corresponding stub. |
| const Register from = c_rarg0; // source array address |
| const Register to = c_rarg1; // destination array address |
| const Register count = c_rarg2; // elements count |
| |
| // 'from', 'to' and 'count' must be set in this order, since they |
| // alias 'src', 'src_pos' and 'dst' respectively. |
| |
| assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); |
| |
| // The possible values of elsize are 0-3, i.e. exact_log2(element |
| // size in bytes). We do a simple bitwise binary search. |
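| // In effect (a sketch of the dispatch below): |
| // |
| //   if (elsize & 2) { if (elsize & 1) goto longs; else goto ints; } |
| //   else { if (elsize & 1) goto shorts; else goto bytes; } |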
| __ BIND(L_copy_bytes); |
| __ tbnz(r15_elsize, 1, L_copy_ints); |
| __ tbnz(r15_elsize, 0, L_copy_shorts); |
| __ lea(from, Address(src, src_pos));// src_addr |
| __ lea(to, Address(dst, dst_pos));// dst_addr |
| __ movw(count, scratch_length); // length |
| __ b(RuntimeAddress(byte_copy_entry)); |
| |
| __ BIND(L_copy_shorts); |
| __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr |
| __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr |
| __ movw(count, scratch_length); // length |
| __ b(RuntimeAddress(short_copy_entry)); |
| |
| __ BIND(L_copy_ints); |
| __ tbnz(r15_elsize, 0, L_copy_longs); |
| __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr |
| __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr |
| __ movw(count, scratch_length); // length |
| __ b(RuntimeAddress(int_copy_entry)); |
| |
| __ BIND(L_copy_longs); |
| #ifdef ASSERT |
| { |
| BLOCK_COMMENT("assert long copy {"); |
| Label L; |
| __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize |
| __ cmpw(r15_elsize, LogBytesPerLong); |
| __ br(Assembler::EQ, L); |
| __ stop("must be long copy, but elsize is wrong"); |
| __ bind(L); |
| BLOCK_COMMENT("} assert long copy done"); |
| } |
| #endif |
| __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr |
| __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr |
| __ movw(count, scratch_length); // length |
| __ b(RuntimeAddress(long_copy_entry)); |
| |
| // ObjArrayKlass |
| __ BIND(L_objArray); |
| // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] |
| |
| Label L_plain_copy, L_checkcast_copy; |
| // test array classes for subtyping |
| __ load_klass(r15, dst); |
| __ cmp(scratch_src_klass, r15); // usual case is exact equality |
| __ br(Assembler::NE, L_checkcast_copy); |
| |
| // Identically typed arrays can be copied without element-wise checks. |
| arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, |
| rscratch2, L_failed); |
| |
| __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); |
| __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); |
| __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); |
| __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); |
| __ movw(count, scratch_length); // length |
| __ BIND(L_plain_copy); |
| __ b(RuntimeAddress(oop_copy_entry)); |
| |
| __ BIND(L_checkcast_copy); |
| // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) |
| { |
| // Before looking at dst.length, make sure dst is also an objArray. |
| __ ldrw(rscratch1, Address(r15, lh_offset)); |
| __ movw(rscratch2, objArray_lh); |
| __ eorw(rscratch1, rscratch1, rscratch2); |
| __ cbnzw(rscratch1, L_failed); |
| |
| // It is safe to examine both src.length and dst.length. |
| arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, |
| r15, L_failed); |
| |
| __ load_klass(dst_klass, dst); // reload |
| |
| // Marshal the base address arguments now, freeing registers. |
| __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); |
| __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); |
| __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); |
| __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); |
| __ movw(count, length); // length (reloaded) |
| Register sco_temp = c_rarg3; // this register is free now |
| assert_different_registers(from, to, count, sco_temp, |
| dst_klass, scratch_src_klass); |
| // assert_clean_int(count, sco_temp); |
| |
| // Generate the type check. |
| const int sco_offset = in_bytes(Klass::super_check_offset_offset()); |
| __ ldrw(sco_temp, Address(dst_klass, sco_offset)); |
| |
| // Smashes rscratch1, rscratch2 |
| generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); |
| |
| // Fetch destination element klass from the ObjArrayKlass header. |
| int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); |
| __ ldr(dst_klass, Address(dst_klass, ek_offset)); |
| __ ldrw(sco_temp, Address(dst_klass, sco_offset)); |
| |
| // the checkcast_copy loop needs two extra arguments: |
| assert(c_rarg3 == sco_temp, "#3 already in place"); |
| // Set up arguments for checkcast_copy_entry. |
| __ mov(c_rarg4, dst_klass); // dst.klass.element_klass |
| __ b(RuntimeAddress(checkcast_copy_entry)); |
| } |
| |
| __ BIND(L_failed); |
| __ mov(r0, -1); |
| __ leave(); // required for proper stackwalking of RuntimeStub frame |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // |
| // Generate stub for array fill. If "aligned" is true, the |
| // "to" address is assumed to be heapword aligned. |
| // |
| // Arguments for generated stub: |
| // to: c_rarg0 |
| // value: c_rarg1 |
| // count: c_rarg2 treated as signed |
| // |
| address generate_fill(BasicType t, bool aligned, const char *name) { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| |
| BLOCK_COMMENT("Entry:"); |
| |
| const Register to = c_rarg0; // destination array address |
| const Register value = c_rarg1; // value |
| const Register count = c_rarg2; // elements count |
| |
| const Register bz_base = r10; // base for block_zero routine |
| const Register cnt_words = r11; // temp register |
| |
| __ enter(); |
| |
| Label L_fill_elements, L_exit1; |
| |
| int shift = -1; |
| switch (t) { |
| case T_BYTE: |
| shift = 0; |
| __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element |
| __ bfi(value, value, 8, 8); // 8 bit -> 16 bit |
| __ bfi(value, value, 16, 16); // 16 bit -> 32 bit |
| __ br(Assembler::LO, L_fill_elements); |
| break; |
| case T_SHORT: |
| shift = 1; |
| __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element |
| __ bfi(value, value, 16, 16); // 16 bit -> 32 bit |
| __ br(Assembler::LO, L_fill_elements); |
| break; |
| case T_INT: |
| shift = 2; |
| __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element |
| __ br(Assembler::LO, L_fill_elements); |
| break; |
| default: ShouldNotReachHere(); |
| } |
| |
| // Align source address at 8 bytes address boundary. |
| Label L_skip_align1, L_skip_align2, L_skip_align4; |
| if (!aligned) { |
| switch (t) { |
| case T_BYTE: |
| // One byte misalignment happens only for byte arrays. |
| __ tbz(to, 0, L_skip_align1); |
| __ strb(value, Address(__ post(to, 1))); |
| __ subw(count, count, 1); |
| __ bind(L_skip_align1); |
| // Fallthrough |
| case T_SHORT: |
| // Two bytes misalignment happens only for byte and short (char) arrays. |
| __ tbz(to, 1, L_skip_align2); |
| __ strh(value, Address(__ post(to, 2))); |
| __ subw(count, count, 2 >> shift); |
| __ bind(L_skip_align2); |
| // Fallthrough |
| case T_INT: |
| // Align to 8 bytes, we know we are 4 byte aligned to start. |
| __ tbz(to, 2, L_skip_align4); |
| __ strw(value, Address(__ post(to, 4))); |
| __ subw(count, count, 4 >> shift); |
| __ bind(L_skip_align4); |
| break; |
| default: ShouldNotReachHere(); |
| } |
| } |
| |
| // |
| // Fill large chunks |
| // |
| __ lsrw(cnt_words, count, 3 - shift); // number of words |
| __ bfi(value, value, 32, 32); // 32 bit -> 64 bit |
| __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); |
| if (UseBlockZeroing) { |
| Label non_block_zeroing, rest; |
| // If the fill value is zero we can use the fast zero_words(). |
| __ cbnz(value, non_block_zeroing); |
| __ mov(bz_base, to); |
| __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); |
| address tpc = __ zero_words(bz_base, cnt_words); |
| if (tpc == nullptr) { |
| fatal("CodeCache is full at generate_fill"); |
| } |
| __ b(rest); |
| __ bind(non_block_zeroing); |
| __ fill_words(to, cnt_words, value); |
| __ bind(rest); |
| } else { |
| __ fill_words(to, cnt_words, value); |
| } |
| |
| // Remaining count is less than 8 bytes. Fill it by a single store. |
| // Note that the total length is no less than 8 bytes. |
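| // E.g. (a worked sketch, destination already 8-byte aligned): for a |
| // 13-byte byte fill, 8 bytes go through the word loop above and the |
| // trailing 5 are covered by one 8-byte store ending at the last byte, |
| // deliberately re-writing 3 already-filled bytes. |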
| if (t == T_BYTE || t == T_SHORT) { |
| Label L_exit1; |
| __ cbzw(count, L_exit1); |
| __ add(to, to, count, Assembler::LSL, shift); // points to the end |
| __ str(value, Address(to, -8)); // overwrite some elements |
| __ bind(L_exit1); |
| __ leave(); |
| __ ret(lr); |
| } |
| |
| // Handle copies less than 8 bytes. |
| Label L_fill_2, L_fill_4, L_exit2; |
| __ bind(L_fill_elements); |
| switch (t) { |
| case T_BYTE: |
| __ tbz(count, 0, L_fill_2); |
| __ strb(value, Address(__ post(to, 1))); |
| __ bind(L_fill_2); |
| __ tbz(count, 1, L_fill_4); |
| __ strh(value, Address(__ post(to, 2))); |
| __ bind(L_fill_4); |
| __ tbz(count, 2, L_exit2); |
| __ strw(value, Address(to)); |
| break; |
| case T_SHORT: |
| __ tbz(count, 0, L_fill_4); |
| __ strh(value, Address(__ post(to, 2))); |
| __ bind(L_fill_4); |
| __ tbz(count, 1, L_exit2); |
| __ strw(value, Address(to)); |
| break; |
| case T_INT: |
| __ cbzw(count, L_exit2); |
| __ strw(value, Address(to)); |
| break; |
| default: ShouldNotReachHere(); |
| } |
| __ bind(L_exit2); |
| __ leave(); |
| __ ret(lr); |
| return start; |
| } |
| |
| address generate_data_cache_writeback() { |
| const Register line = c_rarg0; // address of line to write back |
| |
| __ align(CodeEntryAlignment); |
| |
| StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); |
| |
| address start = __ pc(); |
| __ enter(); |
| __ cache_wb(Address(line, 0)); |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| address generate_data_cache_writeback_sync() { |
| const Register is_pre = c_rarg0; // pre or post sync |
| |
| __ align(CodeEntryAlignment); |
| |
| StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); |
| |
| // pre wbsync is a no-op |
| // post wbsync translates to a full memory barrier |
| |
| Label skip; |
| address start = __ pc(); |
| __ enter(); |
| __ cbnz(is_pre, skip); |
| __ cache_wbsync(false); |
| __ bind(skip); |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| void generate_arraycopy_stubs() { |
| address entry; |
| address entry_jbyte_arraycopy; |
| address entry_jshort_arraycopy; |
| address entry_jint_arraycopy; |
| address entry_oop_arraycopy; |
| address entry_jlong_arraycopy; |
| address entry_checkcast_arraycopy; |
| |
| generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); |
| generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); |
| |
| generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); |
| generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); |
| |
| generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); |
| generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); |
| |
| StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); |
| |
| //*** jbyte |
| // Always need aligned and unaligned versions |
| StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, |
| "jbyte_disjoint_arraycopy"); |
| StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, |
| &entry_jbyte_arraycopy, |
| "jbyte_arraycopy"); |
| StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, |
| "arrayof_jbyte_disjoint_arraycopy"); |
| StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, |
| "arrayof_jbyte_arraycopy"); |
| |
| //*** jshort |
| // Always need aligned and unaligned versions |
| StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, |
| "jshort_disjoint_arraycopy"); |
| StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, |
| &entry_jshort_arraycopy, |
| "jshort_arraycopy"); |
| StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, |
| "arrayof_jshort_disjoint_arraycopy"); |
| StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, |
| "arrayof_jshort_arraycopy"); |
| |
| //*** jint |
| // Aligned versions |
| StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, |
| "arrayof_jint_disjoint_arraycopy"); |
| StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, |
| "arrayof_jint_arraycopy"); |
| // In 64 bit we need both aligned and unaligned versions of jint arraycopy. |
| // entry_jint_arraycopy always points to the unaligned version |
| StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, |
| "jint_disjoint_arraycopy"); |
| StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, |
| &entry_jint_arraycopy, |
| "jint_arraycopy"); |
| |
| //*** jlong |
| // It is always aligned |
| StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, |
| "arrayof_jlong_disjoint_arraycopy"); |
| StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, |
| "arrayof_jlong_arraycopy"); |
| StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; |
| StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; |
| |
| //*** oops |
| { |
| // With compressed oops we need unaligned versions; notice that |
| // we overwrite entry_oop_arraycopy. |
| bool aligned = !UseCompressedOops; |
| |
| StubRoutines::_arrayof_oop_disjoint_arraycopy |
| = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", |
| /*dest_uninitialized*/false); |
| StubRoutines::_arrayof_oop_arraycopy |
| = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", |
| /*dest_uninitialized*/false); |
| // Aligned versions without pre-barriers |
| StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit |
| = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", |
| /*dest_uninitialized*/true); |
| StubRoutines::_arrayof_oop_arraycopy_uninit |
| = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", |
| /*dest_uninitialized*/true); |
| } |
| |
| StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; |
| StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; |
| StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; |
| StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; |
| |
| StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); |
| StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, |
| /*dest_uninitialized*/true); |
| |
| StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", |
| entry_jbyte_arraycopy, |
| entry_jshort_arraycopy, |
| entry_jint_arraycopy, |
| entry_jlong_arraycopy); |
| |
| StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", |
| entry_jbyte_arraycopy, |
| entry_jshort_arraycopy, |
| entry_jint_arraycopy, |
| entry_oop_arraycopy, |
| entry_jlong_arraycopy, |
| entry_checkcast_arraycopy); |
| |
| StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); |
| StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); |
| StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); |
| StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); |
| StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); |
| StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); |
| } |
| |
| void generate_math_stubs() { Unimplemented(); } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - source byte array address |
| // c_rarg1 - destination byte array address |
| // c_rarg2 - K (key) in little endian int array |
| // |
| address generate_aescrypt_encryptBlock() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); |
| |
| const Register from = c_rarg0; // source array address |
| const Register to = c_rarg1; // destination array address |
| const Register key = c_rarg2; // key array address |
| const Register keylen = rscratch1; |
| |
| address start = __ pc(); |
| __ enter(); |
| |
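| // 'key' points at the first int of the key array, so the array length |
| // field lives at (length_offset - base_offset) from it; the expanded key |
| // schedule is 44, 52 or 60 ints for AES-128/-192/-256 respectively. |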
| __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| |
| __ aesenc_loadkeys(key, keylen); |
| __ aesecb_encrypt(from, to, keylen); |
| |
| __ mov(r0, 0); |
| |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - source byte array address |
| // c_rarg1 - destination byte array address |
| // c_rarg2 - K (key) in little endian int array |
| // |
| address generate_aescrypt_decryptBlock() { |
| assert(UseAES, "need AES cryptographic extension support"); |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); |
| Label L_doLast; |
| |
| const Register from = c_rarg0; // source array address |
| const Register to = c_rarg1; // destination array address |
| const Register key = c_rarg2; // key array address |
| const Register keylen = rscratch1; |
| |
| address start = __ pc(); |
| __ enter(); // required for proper stackwalking of RuntimeStub frame |
| |
| __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| |
| __ aesecb_decrypt(from, to, key, keylen); |
| |
| __ mov(r0, 0); |
| |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - source byte array address |
| // c_rarg1 - destination byte array address |
| // c_rarg2 - K (key) in little endian int array |
| // c_rarg3 - r vector byte array address |
| // c_rarg4 - input length |
| // |
| // Output: |
| // r0 - input length |
| // |
| address generate_cipherBlockChaining_encryptAESCrypt() { |
| assert(UseAES, "need AES cryptographic extension support"); |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); |
| |
| Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; |
| |
| const Register from = c_rarg0; // source array address |
| const Register to = c_rarg1; // destination array address |
| const Register key = c_rarg2; // key array address |
| const Register rvec = c_rarg3; // r byte array initialized from initvector array address |
| // and left with the results of the last encryption block |
| const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) |
| const Register keylen = rscratch1; |
| |
| address start = __ pc(); |
| |
| __ enter(); |
| |
| __ movw(rscratch2, len_reg); |
| |
| __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| |
| __ ld1(v0, __ T16B, rvec); |
| |
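| // keylen is 44 (AES-128), 52 (AES-192) or 60 (AES-256) ints; skip |
| // loading the round keys that shorter schedules do not have. |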
| __ cmpw(keylen, 52); |
| __ br(Assembler::CC, L_loadkeys_44); |
| __ br(Assembler::EQ, L_loadkeys_52); |
| |
| __ ld1(v17, v18, __ T16B, __ post(key, 32)); |
| __ rev32(v17, __ T16B, v17); |
| __ rev32(v18, __ T16B, v18); |
| __ BIND(L_loadkeys_52); |
| __ ld1(v19, v20, __ T16B, __ post(key, 32)); |
| __ rev32(v19, __ T16B, v19); |
| __ rev32(v20, __ T16B, v20); |
| __ BIND(L_loadkeys_44); |
| __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); |
| __ rev32(v21, __ T16B, v21); |
| __ rev32(v22, __ T16B, v22); |
| __ rev32(v23, __ T16B, v23); |
| __ rev32(v24, __ T16B, v24); |
| __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); |
| __ rev32(v25, __ T16B, v25); |
| __ rev32(v26, __ T16B, v26); |
| __ rev32(v27, __ T16B, v27); |
| __ rev32(v28, __ T16B, v28); |
| __ ld1(v29, v30, v31, __ T16B, key); |
| __ rev32(v29, __ T16B, v29); |
| __ rev32(v30, __ T16B, v30); |
| __ rev32(v31, __ T16B, v31); |
| |
| __ BIND(L_aes_loop); |
| __ ld1(v1, __ T16B, __ post(from, 16)); |
| __ eor(v0, __ T16B, v0, v1); |
| |
| __ br(Assembler::CC, L_rounds_44); |
| __ br(Assembler::EQ, L_rounds_52); |
| |
| __ aese(v0, v17); __ aesmc(v0, v0); |
| __ aese(v0, v18); __ aesmc(v0, v0); |
| __ BIND(L_rounds_52); |
| __ aese(v0, v19); __ aesmc(v0, v0); |
| __ aese(v0, v20); __ aesmc(v0, v0); |
| __ BIND(L_rounds_44); |
| __ aese(v0, v21); __ aesmc(v0, v0); |
| __ aese(v0, v22); __ aesmc(v0, v0); |
| __ aese(v0, v23); __ aesmc(v0, v0); |
| __ aese(v0, v24); __ aesmc(v0, v0); |
| __ aese(v0, v25); __ aesmc(v0, v0); |
| __ aese(v0, v26); __ aesmc(v0, v0); |
| __ aese(v0, v27); __ aesmc(v0, v0); |
| __ aese(v0, v28); __ aesmc(v0, v0); |
| __ aese(v0, v29); __ aesmc(v0, v0); |
| __ aese(v0, v30); |
| __ eor(v0, __ T16B, v0, v31); |
| |
| __ st1(v0, __ T16B, __ post(to, 16)); |
| |
| __ subw(len_reg, len_reg, 16); |
| __ cbnzw(len_reg, L_aes_loop); |
| |
| __ st1(v0, __ T16B, rvec); |
| |
| __ mov(r0, rscratch2); |
| |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - source byte array address |
| // c_rarg1 - destination byte array address |
| // c_rarg2 - K (key) in little endian int array |
| // c_rarg3 - r vector byte array address |
| // c_rarg4 - input length |
| // |
| // Output: |
| // r0 - input length |
| // |
| address generate_cipherBlockChaining_decryptAESCrypt() { |
| assert(UseAES, "need AES cryptographic extension support"); |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); |
| |
| Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; |
| |
| const Register from = c_rarg0; // source array address |
| const Register to = c_rarg1; // destination array address |
| const Register key = c_rarg2; // key array address |
| const Register rvec = c_rarg3; // r byte array initialized from initvector array address |
| // and left with the results of the last encryption block |
| const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) |
| const Register keylen = rscratch1; |
| |
| address start = __ pc(); |
| |
| __ enter(); |
| |
| __ movw(rscratch2, len_reg); |
| |
| __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| |
| __ ld1(v2, __ T16B, rvec); |
| |
| __ ld1(v31, __ T16B, __ post(key, 16)); |
| __ rev32(v31, __ T16B, v31); |
| |
| __ cmpw(keylen, 52); |
| __ br(Assembler::CC, L_loadkeys_44); |
| __ br(Assembler::EQ, L_loadkeys_52); |
| |
| __ ld1(v17, v18, __ T16B, __ post(key, 32)); |
| __ rev32(v17, __ T16B, v17); |
| __ rev32(v18, __ T16B, v18); |
| __ BIND(L_loadkeys_52); |
| __ ld1(v19, v20, __ T16B, __ post(key, 32)); |
| __ rev32(v19, __ T16B, v19); |
| __ rev32(v20, __ T16B, v20); |
| __ BIND(L_loadkeys_44); |
| __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); |
| __ rev32(v21, __ T16B, v21); |
| __ rev32(v22, __ T16B, v22); |
| __ rev32(v23, __ T16B, v23); |
| __ rev32(v24, __ T16B, v24); |
| __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); |
| __ rev32(v25, __ T16B, v25); |
| __ rev32(v26, __ T16B, v26); |
| __ rev32(v27, __ T16B, v27); |
| __ rev32(v28, __ T16B, v28); |
| __ ld1(v29, v30, __ T16B, key); |
| __ rev32(v29, __ T16B, v29); |
| __ rev32(v30, __ T16B, v30); |
| |
| __ BIND(L_aes_loop); |
| __ ld1(v0, __ T16B, __ post(from, 16)); |
| __ orr(v1, __ T16B, v0, v0); |
| |
| __ br(Assembler::CC, L_rounds_44); |
| __ br(Assembler::EQ, L_rounds_52); |
| |
| __ aesd(v0, v17); __ aesimc(v0, v0); |
| __ aesd(v0, v18); __ aesimc(v0, v0); |
| __ BIND(L_rounds_52); |
| __ aesd(v0, v19); __ aesimc(v0, v0); |
| __ aesd(v0, v20); __ aesimc(v0, v0); |
| __ BIND(L_rounds_44); |
| __ aesd(v0, v21); __ aesimc(v0, v0); |
| __ aesd(v0, v22); __ aesimc(v0, v0); |
| __ aesd(v0, v23); __ aesimc(v0, v0); |
| __ aesd(v0, v24); __ aesimc(v0, v0); |
| __ aesd(v0, v25); __ aesimc(v0, v0); |
| __ aesd(v0, v26); __ aesimc(v0, v0); |
| __ aesd(v0, v27); __ aesimc(v0, v0); |
| __ aesd(v0, v28); __ aesimc(v0, v0); |
| __ aesd(v0, v29); __ aesimc(v0, v0); |
| __ aesd(v0, v30); |
| __ eor(v0, __ T16B, v0, v31); |
| __ eor(v0, __ T16B, v0, v2); |
| |
| __ st1(v0, __ T16B, __ post(to, 16)); |
| __ orr(v2, __ T16B, v1, v1); |
| |
| __ subw(len_reg, len_reg, 16); |
| __ cbnzw(len_reg, L_aes_loop); |
| |
| __ st1(v2, __ T16B, rvec); |
| |
| __ mov(r0, rscratch2); |
| |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Big-endian 128-bit + 64-bit -> 128-bit addition. |
| // Inputs: 128-bits. in is preserved. |
| // The least-significant 64-bit word is in the upper dword of each vector. |
| // inc (the 64-bit increment) is preserved. Its lower dword must be zero. |
| // Output: result |
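| // |
| // Worked example (a sketch): adding inc == 1 to a counter whose |
| // least-significant dword is all-ones wraps that dword to zero; the |
| // unsigned compare then yields an all-ones mask in that lane, ext moves |
| // the mask into the most-significant lane, and the final subtract of -1 |
| // applies the carry to the most-significant dword. |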
| void be_add_128_64(FloatRegister result, FloatRegister in, |
| FloatRegister inc, FloatRegister tmp) { |
| assert_different_registers(result, tmp, inc); |
| |
| __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of |
| // input |
| __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing |
| __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and |
| // MSD == 0 (must be!) to LSD |
| __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow |
| } |
| |
| // CTR AES crypt. |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - source byte array address |
| // c_rarg1 - destination byte array address |
| // c_rarg2 - K (key) in little endian int array |
| // c_rarg3 - counter vector byte array address |
| // c_rarg4 - input length |
| // c_rarg5 - saved encryptedCounter start |
| // c_rarg6 - saved used length |
| // |
| // Output: |
| // r0 - input length |
| // |
| address generate_counterMode_AESCrypt() { |
| const Register in = c_rarg0; |
| const Register out = c_rarg1; |
| const Register key = c_rarg2; |
| const Register counter = c_rarg3; |
| const Register saved_len = c_rarg4, len = r10; |
| const Register saved_encrypted_ctr = c_rarg5; |
| const Register used_ptr = c_rarg6, used = r12; |
| |
| const Register offset = r7; |
| const Register keylen = r11; |
| |
| const unsigned char block_size = 16; |
| const int bulk_width = 4; |
| // NB: bulk_width can be 4 or 8. 8 gives slightly faster |
| // performance with larger data sizes, but it also means that the |
| // fast path isn't used until you have at least 8 blocks, and up |
| // to 127 bytes of data will be executed on the slow path. For |
| // that reason, and also so as not to blow away too much icache, 4 |
| // blocks seems like a sensible compromise. |
| |
| // Algorithm: |
| // |
| // if (len == 0) { |
| // goto DONE; |
| // } |
| // int result = len; |
| // do { |
| // if (used >= blockSize) { |
| // if (len >= bulk_width * blockSize) { |
| // CTR_large_block(); |
| // if (len == 0) |
| // goto DONE; |
| // } |
| // for (;;) { |
| // 16ByteVector v0 = counter; |
| // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); |
| // used = 0; |
| // if (len < blockSize) |
| // break; /* goto NEXT */ |
| // 16ByteVector v1 = load16Bytes(in, offset); |
| // v1 = v1 ^ encryptedCounter; |
| // store16Bytes(out, offset); |
| // used = blockSize; |
| // offset += blockSize; |
| // len -= blockSize; |
| // if (len == 0) |
| // goto DONE; |
| // } |
| // } |
| // NEXT: |
| // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); |
| // len--; |
| // } while (len != 0); |
| // DONE: |
| // return result; |
| // |
| // CTR_large_block() |
| // Wide bulk encryption of whole blocks. |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); |
| const address start = __ pc(); |
| __ enter(); |
| |
| Label DONE, CTR_large_block, large_block_return; |
| __ ldrw(used, Address(used_ptr)); |
| __ cbzw(saved_len, DONE); |
| |
| __ mov(len, saved_len); |
| __ mov(offset, 0); |
| |
| // Compute #rounds for AES based on the length of the key array |
| __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| |
| __ aesenc_loadkeys(key, keylen); |
| |
| { |
| Label L_CTR_loop, NEXT; |
| |
| __ bind(L_CTR_loop); |
| |
| __ cmp(used, block_size); |
| __ br(__ LO, NEXT); |
| |
| // Maybe we have a lot of data |
| __ subsw(rscratch1, len, bulk_width * block_size); |
| __ br(__ HS, CTR_large_block); |
| __ BIND(large_block_return); |
| __ cbzw(len, DONE); |
| |
| // Setup the counter |
| __ movi(v4, __ T4S, 0); |
| __ movi(v5, __ T4S, 1); |
| __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } |
| |
| // 128-bit big-endian increment |
| __ ld1(v0, __ T16B, counter); |
| __ rev64(v16, __ T16B, v0); |
| be_add_128_64(v16, v16, v4, /*tmp*/v5); |
| __ rev64(v16, __ T16B, v16); |
| __ st1(v16, __ T16B, counter); |
| // Previous counter value is in v0 |
| // v4 contains { 0, 1 } |
| |
| { |
| // We have fewer than bulk_width blocks of data left. Encrypt |
| // them one by one until there is less than a full block |
| // remaining, being careful to save both the encrypted counter |
| // and the counter. |
| |
| Label inner_loop; |
| __ bind(inner_loop); |
| // Counter to encrypt is in v0 |
| __ aesecb_encrypt(noreg, noreg, keylen); |
| __ st1(v0, __ T16B, saved_encrypted_ctr); |
| |
| // Do we have a remaining full block? |
| |
| __ mov(used, 0); |
| __ cmp(len, block_size); |
| __ br(__ LO, NEXT); |
| |
| // Yes, we have a full block |
| __ ldrq(v1, Address(in, offset)); |
| __ eor(v1, __ T16B, v1, v0); |
| __ strq(v1, Address(out, offset)); |
| __ mov(used, block_size); |
| __ add(offset, offset, block_size); |
| |
| __ subw(len, len, block_size); |
| __ cbzw(len, DONE); |
| |
| // Increment the counter, store it back |
| __ orr(v0, __ T16B, v16, v16); |
| __ rev64(v16, __ T16B, v16); |
| be_add_128_64(v16, v16, v4, /*tmp*/v5); |
| __ rev64(v16, __ T16B, v16); |
| __ st1(v16, __ T16B, counter); // Save the incremented counter back |
| |
| __ b(inner_loop); |
| } |
| |
| __ BIND(NEXT); |
| |
| // Encrypt a single byte, and loop. |
| // We expect this to be a rare event. |
| __ ldrb(rscratch1, Address(in, offset)); |
| __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); |
| __ eor(rscratch1, rscratch1, rscratch2); |
| __ strb(rscratch1, Address(out, offset)); |
| __ add(offset, offset, 1); |
| __ add(used, used, 1); |
| __ subw(len, len, 1); |
| __ cbnzw(len, L_CTR_loop); |
| } |
| |
| __ bind(DONE); |
| __ strw(used, Address(used_ptr)); |
| __ mov(r0, saved_len); |
| |
| __ leave(); // required for proper stackwalking of RuntimeStub frame |
| __ ret(lr); |
| |
| // Bulk encryption |
| |
| __ BIND(CTR_large_block); |
| assert(bulk_width == 4 || bulk_width == 8, "must be"); |
| |
| if (bulk_width == 8) { |
| __ sub(sp, sp, 4 * 16); |
| __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); |
| } |
| __ sub(sp, sp, 4 * 16); |
| __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); |
| RegSet saved_regs = (RegSet::of(in, out, offset) |
| + RegSet::of(saved_encrypted_ctr, used_ptr, len)); |
| __ push(saved_regs, sp); |
| __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption |
| __ add(in, in, offset); |
| __ add(out, out, offset); |
| |
| // Keys should already be loaded into the correct registers |
| |
| __ ld1(v0, __ T16B, counter); // v0 contains the first counter |
| __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter |
| |
| // AES/CTR loop |
| { |
| Label L_CTR_loop; |
| __ BIND(L_CTR_loop); |
| |
| // Setup the counters |
| __ movi(v8, __ T4S, 0); |
| __ movi(v9, __ T4S, 1); |
| __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } |
| |
| for (int i = 0; i < bulk_width; i++) { |
| FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); |
| __ rev64(v0_ofs, __ T16B, v16); |
| be_add_128_64(v16, v16, v8, /*tmp*/v9); |
| } |
| |
| __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); |
| |
| // Encrypt the counters |
| __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); |
| |
| if (bulk_width == 8) { |
| __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); |
| } |
| |
| // XOR the encrypted counters with the inputs |
| for (int i = 0; i < bulk_width; i++) { |
| FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); |
| FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); |
| __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); |
| } |
| |
| // Write the encrypted data |
| __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); |
| if (bulk_width == 8) { |
| __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); |
| } |
| |
| __ subw(len, len, 16 * bulk_width); |
| __ cbnzw(len, L_CTR_loop); |
| } |
| |
| // Save the counter back where it goes |
| __ rev64(v16, __ T16B, v16); |
| __ st1(v16, __ T16B, counter); |
| |
| __ pop(saved_regs, sp); |
| |
| __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); |
| if (bulk_width == 8) { |
| __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); |
| } |
| |
| __ andr(rscratch1, len, -16 * bulk_width); |
| __ sub(len, len, rscratch1); |
| __ add(offset, offset, rscratch1); |
| __ mov(used, 16); |
| __ strw(used, Address(used_ptr)); |
| __ b(large_block_return); |
| |
| return start; |
| } |
| |
| // Vector AES Galois Counter Mode implementation. Parameters: |
| // |
| // in = c_rarg0 |
| // len = c_rarg1 |
| // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) |
| // out = c_rarg3 |
| // key = c_rarg4 |
| // state = c_rarg5 - GHASH.state |
| // subkeyHtbl = c_rarg6 - powers of H |
| // counter = c_rarg7 - 16 bytes of CTR |
| // return - number of processed bytes |
| address generate_galoisCounterMode_AESCrypt() { |
| address ghash_polynomial = __ pc(); |
| __ emit_int64(0x87); // The low-order bits of the field |
| // polynomial (i.e. p = z^7+z^2+z+1) |
| // repeated in the low and high parts of a |
| // 128-bit vector |
| __ emit_int64(0x87); |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); |
| address start = __ pc(); |
| __ enter(); |
| |
| const Register in = c_rarg0; |
| const Register len = c_rarg1; |
| const Register ct = c_rarg2; |
| const Register out = c_rarg3; |
| |
| const Register key = c_rarg4; |
| const Register state = c_rarg5; |
| |
| const Register subkeyHtbl = c_rarg6; |
| |
| const Register counter = c_rarg7; // updated with the incremented counter on return |
| |
| const Register keylen = r10; |
| // Save state before entering routine |
| __ sub(sp, sp, 4 * 16); |
| __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); |
| __ sub(sp, sp, 4 * 16); |
| __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); |
| |
| // __ andr(len, len, -512); |
| __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption |
| __ str(len, __ pre(sp, -2 * wordSize)); |
| |
| Label DONE; |
| __ cbz(len, DONE); |
| |
| // Compute #rounds for AES based on the length of the key array |
| __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
| |
| __ aesenc_loadkeys(key, keylen); |
| __ ld1(v0, __ T16B, counter); // v0 contains the first counter |
| __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter |
| |
| // AES/CTR loop |
| { |
| Label L_CTR_loop; |
| __ BIND(L_CTR_loop); |
| |
| // Setup the counters |
| __ movi(v8, __ T4S, 0); |
| __ movi(v9, __ T4S, 1); |
| __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } |
| |
| assert(v0->encoding() < v8->encoding(), ""); |
| for (int i = v0->encoding(); i < v8->encoding(); i++) { |
| FloatRegister f = as_FloatRegister(i); |
| __ rev32(f, __ T16B, v16); |
| __ addv(v16, __ T4S, v16, v8); |
| } |
| |
| __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); |
| |
| // Encrypt the counters |
| __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); |
| |
| __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); |
| |
| // XOR the encrypted counters with the inputs |
| for (int i = 0; i < 8; i++) { |
| FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); |
| FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); |
| __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); |
| } |
| __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); |
| __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); |
| |
| __ subw(len, len, 16 * 8); |
| __ cbnzw(len, L_CTR_loop); |
| } |
| |
| __ rev32(v16, __ T16B, v16); |
| __ st1(v16, __ T16B, counter); |
| |
| __ ldr(len, Address(sp)); |
| __ lsr(len, len, exact_log2(16)); // We want the count of blocks |
| |
| // GHASH/CTR loop |
| __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, |
| len, /*unrolls*/4); |
| |
| #ifdef ASSERT |
| { Label L; |
| __ cmp(len, (unsigned char)0); |
| __ br(Assembler::EQ, L); |
| __ stop("stubGenerator: abort"); |
| __ bind(L); |
| } |
| #endif |
| |
| __ bind(DONE); |
| // Return the number of bytes processed |
| __ ldr(r0, __ post(sp, 2 * wordSize)); |
| |
| __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); |
| __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); |
| |
| __ leave(); // required for proper stackwalking of RuntimeStub frame |
| __ ret(lr); |
| return start; |
| } |
| |
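  // Cached64Bytes caches one 64-byte input block in eight 64-bit
  // general-purpose registers (two 4-byte words per register), so that the
  // MD5 round helpers can fetch message words with ubfx instead of
  // reloading them from memory.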
| class Cached64Bytes { |
| private: |
| MacroAssembler *_masm; |
| Register _regs[8]; |
| |
| public: |
| Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { |
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
| auto it = rs.begin(); |
| for (auto &r: _regs) { |
| r = *it; |
| ++it; |
| } |
| } |
| |
| void gen_loads(Register base) { |
| for (int i = 0; i < 8; i += 2) { |
| __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); |
| } |
| } |
| |
| // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. |
| void extract_u32(Register dest, int i) { |
| __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); |
| } |
| }; |
| |
| // Utility routines for md5. |
| // Clobbers r10 and r11. |
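  //
  // For reference, each step of RFC 1321 computes
  //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + x[k] + t, s)
  // where f is one of
  //   F(x, y, z) = (x & y) | (~x & z)   -- computed here as ((y ^ z) & x) ^ z
  //   G(x, y, z) = (x & z) | (y & ~z)
  //   H(x, y, z) = x ^ y ^ z
  //   I(x, y, z) = y ^ (x | ~z)
  // The helpers interleave the independent adds (t, x[k]) with the
  // data-dependent chain to shorten the critical path.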
| void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, |
| int k, int s, int t) { |
| Register rscratch3 = r10; |
| Register rscratch4 = r11; |
| |
| __ eorw(rscratch3, r3, r4); |
| __ movw(rscratch2, t); |
| __ andw(rscratch3, rscratch3, r2); |
| __ addw(rscratch4, r1, rscratch2); |
| reg_cache.extract_u32(rscratch1, k); |
| __ eorw(rscratch3, rscratch3, r4); |
| __ addw(rscratch4, rscratch4, rscratch1); |
| __ addw(rscratch3, rscratch3, rscratch4); |
| __ rorw(rscratch2, rscratch3, 32 - s); |
| __ addw(r1, rscratch2, r2); |
| } |
| |
| void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, |
| int k, int s, int t) { |
| Register rscratch3 = r10; |
| Register rscratch4 = r11; |
| |
| __ andw(rscratch3, r2, r4); |
| __ bicw(rscratch4, r3, r4); |
| reg_cache.extract_u32(rscratch1, k); |
| __ movw(rscratch2, t); |
| __ orrw(rscratch3, rscratch3, rscratch4); |
| __ addw(rscratch4, r1, rscratch2); |
| __ addw(rscratch4, rscratch4, rscratch1); |
| __ addw(rscratch3, rscratch3, rscratch4); |
| __ rorw(rscratch2, rscratch3, 32 - s); |
| __ addw(r1, rscratch2, r2); |
| } |
| |
| void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, |
| int k, int s, int t) { |
| Register rscratch3 = r10; |
| Register rscratch4 = r11; |
| |
| __ eorw(rscratch3, r3, r4); |
| __ movw(rscratch2, t); |
| __ addw(rscratch4, r1, rscratch2); |
| reg_cache.extract_u32(rscratch1, k); |
| __ eorw(rscratch3, rscratch3, r2); |
| __ addw(rscratch4, rscratch4, rscratch1); |
| __ addw(rscratch3, rscratch3, rscratch4); |
| __ rorw(rscratch2, rscratch3, 32 - s); |
| __ addw(r1, rscratch2, r2); |
| } |
| |
| void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, |
| int k, int s, int t) { |
| Register rscratch3 = r10; |
| Register rscratch4 = r11; |
| |
| __ movw(rscratch3, t); |
| __ ornw(rscratch2, r2, r4); |
| __ addw(rscratch4, r1, rscratch3); |
| reg_cache.extract_u32(rscratch1, k); |
| __ eorw(rscratch3, rscratch2, r3); |
| __ addw(rscratch4, rscratch4, rscratch1); |
| __ addw(rscratch3, rscratch3, rscratch4); |
| __ rorw(rscratch2, rscratch3, 32 - s); |
| __ addw(r1, rscratch2, r2); |
| } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - byte[] source+offset |
  //   c_rarg1 - int[] MD5 state
| // c_rarg2 - int offset |
| // c_rarg3 - int limit |
| // |
| address generate_md5_implCompress(bool multi_block, const char *name) { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| |
| Register buf = c_rarg0; |
| Register state = c_rarg1; |
| Register ofs = c_rarg2; |
| Register limit = c_rarg3; |
| Register a = r4; |
| Register b = r5; |
| Register c = r6; |
| Register d = r7; |
| Register rscratch3 = r10; |
| Register rscratch4 = r11; |
| |
| Register state_regs[2] = { r12, r13 }; |
| RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; |
| Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers |
| |
| __ push(saved_regs, sp); |
| |
| __ ldp(state_regs[0], state_regs[1], Address(state)); |
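    // state_regs[0] now holds a (bits 0..31) and b (bits 32..63), and
    // state_regs[1] holds c and d; unpack the four state words.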
| __ ubfx(a, state_regs[0], 0, 32); |
| __ ubfx(b, state_regs[0], 32, 32); |
| __ ubfx(c, state_regs[1], 0, 32); |
| __ ubfx(d, state_regs[1], 32, 32); |
| |
| Label md5_loop; |
| __ BIND(md5_loop); |
| |
| reg_cache.gen_loads(buf); |
| |
| // Round 1 |
| md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); |
| md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); |
| md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); |
| md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); |
| md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); |
| md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); |
| md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); |
| md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); |
| md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); |
| md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); |
| md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); |
| md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); |
| md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); |
| md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); |
| md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); |
| md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); |
| |
| // Round 2 |
| md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); |
| md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); |
| md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); |
| md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); |
| md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); |
| md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); |
| md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); |
| md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); |
| md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); |
| md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); |
| md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); |
| md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); |
| md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); |
| md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); |
| md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); |
| md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); |
| |
| // Round 3 |
| md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); |
| md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); |
| md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); |
| md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); |
| md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); |
| md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); |
| md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); |
| md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); |
| md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); |
| md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); |
| md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); |
| md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); |
| md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); |
| md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); |
| md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); |
| md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); |
| |
| // Round 4 |
| md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); |
| md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); |
| md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); |
| md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); |
| md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); |
| md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); |
| md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); |
| md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); |
| md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); |
| md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); |
| md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); |
| md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); |
| md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); |
| md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); |
| md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); |
| md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); |
| |
| __ addw(a, state_regs[0], a); |
| __ ubfx(rscratch2, state_regs[0], 32, 32); |
| __ addw(b, rscratch2, b); |
| __ addw(c, state_regs[1], c); |
| __ ubfx(rscratch4, state_regs[1], 32, 32); |
| __ addw(d, rscratch4, d); |
| |
| __ orr(state_regs[0], a, b, Assembler::LSL, 32); |
| __ orr(state_regs[1], c, d, Assembler::LSL, 32); |
| |
| if (multi_block) { |
| __ add(buf, buf, 64); |
| __ add(ofs, ofs, 64); |
| __ cmp(ofs, limit); |
| __ br(Assembler::LE, md5_loop); |
| __ mov(c_rarg0, ofs); // return ofs |
| } |
| |
| // write hash values back in the correct order |
| __ stp(state_regs[0], state_regs[1], Address(state)); |
| |
| __ pop(saved_regs, sp); |
| |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - byte[] source+offset |
| // c_rarg1 - int[] SHA.state |
| // c_rarg2 - int offset |
| // c_rarg3 - int limit |
| // |
| address generate_sha1_implCompress(bool multi_block, const char *name) { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| |
| Register buf = c_rarg0; |
| Register state = c_rarg1; |
| Register ofs = c_rarg2; |
| Register limit = c_rarg3; |
| |
| Label keys; |
| Label sha1_loop; |
| |
| // load the keys into v0..v3 |
| __ adr(rscratch1, keys); |
| __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); |
    // load the 5-word state: 4 words into v6, 1 word into v7
| __ ldrq(v6, Address(state, 0)); |
| __ ldrs(v7, Address(state, 16)); |
| |
| |
| __ BIND(sha1_loop); |
| // load 64 bytes of data into v16..v19 |
| __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); |
| __ rev32(v16, __ T16B, v16); |
| __ rev32(v17, __ T16B, v17); |
| __ rev32(v18, __ T16B, v18); |
| __ rev32(v19, __ T16B, v19); |
| |
| // do the sha1 |
| __ addv(v4, __ T4S, v16, v0); |
| __ orr(v20, __ T16B, v6, v6); |
| |
| FloatRegister d0 = v16; |
| FloatRegister d1 = v17; |
| FloatRegister d2 = v18; |
| FloatRegister d3 = v19; |
| |
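    // Each of the 20 iterations below retires four of SHA-1's 80 rounds.
    // v4/v5 alternate as the current w+k operand, and the addv in iteration
    // r prepares w+k for iteration r+1; that is why the key boundaries
    // (4/9/14) sit one iteration before the usual 20/40/60 round boundaries.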
| for (int round = 0; round < 20; round++) { |
| FloatRegister tmp1 = (round & 1) ? v4 : v5; |
| FloatRegister tmp2 = (round & 1) ? v21 : v22; |
| FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; |
| FloatRegister tmp4 = (round & 1) ? v5 : v4; |
| FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); |
| |
| if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); |
| if (round < 19) __ addv(tmp1, __ T4S, d1, key); |
| __ sha1h(tmp2, __ T4S, v20); |
| if (round < 5) |
| __ sha1c(v20, __ T4S, tmp3, tmp4); |
| else if (round < 10 || round >= 15) |
| __ sha1p(v20, __ T4S, tmp3, tmp4); |
| else |
| __ sha1m(v20, __ T4S, tmp3, tmp4); |
| if (round < 16) __ sha1su1(d0, __ T4S, d3); |
| |
| tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; |
| } |
| |
| __ addv(v7, __ T2S, v7, v21); |
| __ addv(v6, __ T4S, v6, v20); |
| |
| if (multi_block) { |
| __ add(ofs, ofs, 64); |
| __ cmp(ofs, limit); |
| __ br(Assembler::LE, sha1_loop); |
| __ mov(c_rarg0, ofs); // return ofs |
| } |
| |
| __ strq(v6, Address(state, 0)); |
| __ strs(v7, Address(state, 16)); |
| |
| __ ret(lr); |
| |
| __ bind(keys); |
| __ emit_int32(0x5a827999); |
| __ emit_int32(0x6ed9eba1); |
| __ emit_int32(0x8f1bbcdc); |
| __ emit_int32(0xca62c1d6); |
| |
| return start; |
| } |
| |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - byte[] source+offset |
| // c_rarg1 - int[] SHA.state |
| // c_rarg2 - int offset |
| // c_rarg3 - int limit |
| // |
| address generate_sha256_implCompress(bool multi_block, const char *name) { |
| static const uint32_t round_consts[64] = { |
| 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
| 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
| 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
| 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
| 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
| 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
| 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
| 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
| 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
| 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
| 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
| 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
| 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
| 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
| 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
| 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, |
| }; |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| |
| Register buf = c_rarg0; |
| Register state = c_rarg1; |
| Register ofs = c_rarg2; |
| Register limit = c_rarg3; |
| |
    Label sha256_loop;
| |
| __ stpd(v8, v9, __ pre(sp, -32)); |
| __ stpd(v10, v11, Address(sp, 16)); |
| |
| // dga == v0 |
| // dgb == v1 |
| // dg0 == v2 |
| // dg1 == v3 |
| // dg2 == v4 |
| // t0 == v6 |
| // t1 == v7 |
| |
| // load 16 keys to v16..v31 |
| __ lea(rscratch1, ExternalAddress((address)round_consts)); |
| __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); |
| __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); |
| __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); |
| __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); |
| |
| // load 8 words (256 bits) state |
| __ ldpq(v0, v1, state); |
| |
    __ BIND(sha256_loop);
| // load 64 bytes of data into v8..v11 |
| __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); |
| __ rev32(v8, __ T16B, v8); |
| __ rev32(v9, __ T16B, v9); |
| __ rev32(v10, __ T16B, v10); |
| __ rev32(v11, __ T16B, v11); |
| |
| __ addv(v6, __ T4S, v8, v16); |
| __ orr(v2, __ T16B, v0, v0); |
| __ orr(v3, __ T16B, v1, v1); |
| |
| FloatRegister d0 = v8; |
| FloatRegister d1 = v9; |
| FloatRegister d2 = v10; |
| FloatRegister d3 = v11; |
| |
| |
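    // Each of the 16 iterations below retires four of SHA-256's 64 rounds.
    // v6/v7 alternate as the current w+k operand; the addv prepares w+k for
    // the next iteration from the constants preloaded in v17..v31, and the
    // message schedule (sha256su0/su1) only advances while new schedule
    // words are still needed (round < 12).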
| for (int round = 0; round < 16; round++) { |
| FloatRegister tmp1 = (round & 1) ? v6 : v7; |
| FloatRegister tmp2 = (round & 1) ? v7 : v6; |
| FloatRegister tmp3 = (round & 1) ? v2 : v4; |
| FloatRegister tmp4 = (round & 1) ? v4 : v2; |
| |
| if (round < 12) __ sha256su0(d0, __ T4S, d1); |
| __ orr(v4, __ T16B, v2, v2); |
| if (round < 15) |
| __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); |
| __ sha256h(v2, __ T4S, v3, tmp2); |
| __ sha256h2(v3, __ T4S, v4, tmp2); |
| if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); |
| |
| tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; |
| } |
| |
| __ addv(v0, __ T4S, v0, v2); |
| __ addv(v1, __ T4S, v1, v3); |
| |
| if (multi_block) { |
| __ add(ofs, ofs, 64); |
| __ cmp(ofs, limit); |
      __ br(Assembler::LE, sha256_loop);
| __ mov(c_rarg0, ofs); // return ofs |
| } |
| |
| __ ldpd(v10, v11, Address(sp, 16)); |
| __ ldpd(v8, v9, __ post(sp, 32)); |
| |
| __ stpq(v0, v1, state); |
| |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Double rounds for sha512. |
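  // Each call retires two of SHA-512's 80 rounds, so one block takes 40
  // calls. While the constant pair in vrc0 is consumed, the next pair is
  // preloaded into vrc1 (for dr < 36), and the message schedule is advanced
  // with sha512su0/su1 only while new schedule words are needed (dr < 32).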
| void sha512_dround(int dr, |
| FloatRegister vi0, FloatRegister vi1, |
| FloatRegister vi2, FloatRegister vi3, |
| FloatRegister vi4, FloatRegister vrc0, |
| FloatRegister vrc1, FloatRegister vin0, |
| FloatRegister vin1, FloatRegister vin2, |
| FloatRegister vin3, FloatRegister vin4) { |
| if (dr < 36) { |
| __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); |
| } |
| __ addv(v5, __ T2D, vrc0, vin0); |
| __ ext(v6, __ T16B, vi2, vi3, 8); |
| __ ext(v5, __ T16B, v5, v5, 8); |
| __ ext(v7, __ T16B, vi1, vi2, 8); |
| __ addv(vi3, __ T2D, vi3, v5); |
| if (dr < 32) { |
| __ ext(v5, __ T16B, vin3, vin4, 8); |
| __ sha512su0(vin0, __ T2D, vin1); |
| } |
| __ sha512h(vi3, __ T2D, v6, v7); |
| if (dr < 32) { |
| __ sha512su1(vin0, __ T2D, vin2, v5); |
| } |
| __ addv(vi4, __ T2D, vi1, vi3); |
| __ sha512h2(vi3, __ T2D, vi1, vi0); |
| } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - byte[] source+offset |
| // c_rarg1 - int[] SHA.state |
| // c_rarg2 - int offset |
| // c_rarg3 - int limit |
| // |
| address generate_sha512_implCompress(bool multi_block, const char *name) { |
| static const uint64_t round_consts[80] = { |
| 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, |
| 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, |
| 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, |
| 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, |
| 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, |
| 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, |
| 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, |
| 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, |
| 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, |
| 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, |
| 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, |
| 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, |
| 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, |
| 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, |
| 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, |
| 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, |
| 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, |
| 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, |
| 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, |
| 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, |
| 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, |
| 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, |
| 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, |
| 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, |
| 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, |
| 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, |
| 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L |
| }; |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| |
| Register buf = c_rarg0; |
| Register state = c_rarg1; |
| Register ofs = c_rarg2; |
| Register limit = c_rarg3; |
| |
| __ stpd(v8, v9, __ pre(sp, -64)); |
| __ stpd(v10, v11, Address(sp, 16)); |
| __ stpd(v12, v13, Address(sp, 32)); |
| __ stpd(v14, v15, Address(sp, 48)); |
| |
| Label sha512_loop; |
| |
| // load state |
| __ ld1(v8, v9, v10, v11, __ T2D, state); |
| |
    // load the first 4 pairs of round constants
| __ lea(rscratch1, ExternalAddress((address)round_consts)); |
| __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); |
| |
| __ BIND(sha512_loop); |
| // load 128B of data into v12..v19 |
| __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); |
| __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); |
| __ rev64(v12, __ T16B, v12); |
| __ rev64(v13, __ T16B, v13); |
| __ rev64(v14, __ T16B, v14); |
| __ rev64(v15, __ T16B, v15); |
| __ rev64(v16, __ T16B, v16); |
| __ rev64(v17, __ T16B, v17); |
| __ rev64(v18, __ T16B, v18); |
| __ rev64(v19, __ T16B, v19); |
| |
| __ mov(rscratch2, rscratch1); |
| |
| __ mov(v0, __ T16B, v8); |
| __ mov(v1, __ T16B, v9); |
| __ mov(v2, __ T16B, v10); |
| __ mov(v3, __ T16B, v11); |
| |
| sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); |
| sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); |
| sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); |
| sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); |
| sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); |
| sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); |
| sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); |
| sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); |
| sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); |
| sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); |
| sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); |
| sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); |
| sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); |
| sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); |
| sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); |
| sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); |
| sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); |
| sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); |
| sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); |
| sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); |
| sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); |
| sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); |
| sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); |
| sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); |
| sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); |
| sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); |
| sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); |
| sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); |
| sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); |
| sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); |
| sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); |
| sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); |
| sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); |
| sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); |
| sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); |
| sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); |
| sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); |
| sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); |
| sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); |
| sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); |
| |
| __ addv(v8, __ T2D, v8, v0); |
| __ addv(v9, __ T2D, v9, v1); |
| __ addv(v10, __ T2D, v10, v2); |
| __ addv(v11, __ T2D, v11, v3); |
| |
| if (multi_block) { |
| __ add(ofs, ofs, 128); |
| __ cmp(ofs, limit); |
| __ br(Assembler::LE, sha512_loop); |
| __ mov(c_rarg0, ofs); // return ofs |
| } |
| |
| __ st1(v8, v9, v10, v11, __ T2D, state); |
| |
| __ ldpd(v14, v15, Address(sp, 48)); |
| __ ldpd(v12, v13, Address(sp, 32)); |
| __ ldpd(v10, v11, Address(sp, 16)); |
| __ ldpd(v8, v9, __ post(sp, 64)); |
| |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Arguments: |
| // |
| // Inputs: |
| // c_rarg0 - byte[] source+offset |
| // c_rarg1 - byte[] SHA.state |
| // c_rarg2 - int block_size |
| // c_rarg3 - int offset |
| // c_rarg4 - int limit |
| // |
| address generate_sha3_implCompress(bool multi_block, const char *name) { |
| static const uint64_t round_consts[24] = { |
| 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, |
| 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, |
| 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, |
| 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, |
| 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, |
| 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, |
| 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, |
| 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L |
| }; |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", name); |
| address start = __ pc(); |
| |
| Register buf = c_rarg0; |
| Register state = c_rarg1; |
| Register block_size = c_rarg2; |
| Register ofs = c_rarg3; |
| Register limit = c_rarg4; |
| |
| Label sha3_loop, rounds24_loop; |
| Label sha3_512_or_sha3_384, shake128; |
| |
| __ stpd(v8, v9, __ pre(sp, -64)); |
| __ stpd(v10, v11, Address(sp, 16)); |
| __ stpd(v12, v13, Address(sp, 32)); |
| __ stpd(v14, v15, Address(sp, 48)); |
| |
| // load state |
| __ add(rscratch1, state, 32); |
| __ ld1(v0, v1, v2, v3, __ T1D, state); |
| __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); |
| __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); |
| __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); |
| __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); |
| __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); |
| __ ld1(v24, __ T1D, rscratch1); |
| |
| __ BIND(sha3_loop); |
| |
| // 24 keccak rounds |
| __ movw(rscratch2, 24); |
| |
| // load round_constants base |
| __ lea(rscratch1, ExternalAddress((address) round_consts)); |
| |
| // load input |
| __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); |
| __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); |
| __ eor(v0, __ T8B, v0, v25); |
| __ eor(v1, __ T8B, v1, v26); |
| __ eor(v2, __ T8B, v2, v27); |
| __ eor(v3, __ T8B, v3, v28); |
| __ eor(v4, __ T8B, v4, v29); |
| __ eor(v5, __ T8B, v5, v30); |
| __ eor(v6, __ T8B, v6, v31); |
| |
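    // Dispatch on the rate (block_size, in bytes). The supported rates are:
    //    72 = 0b01001000 (SHA3-512)          104 = 0b01101000 (SHA3-384)
    //   136 = 0b10001000 (SHA3-256/SHAKE256)
    //   144 = 0b10010000 (SHA3-224)          168 = 0b10101000 (SHAKE128)
    // Bits 7, 5 and 4 are enough to tell these apart.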
| // block_size == 72, SHA3-512; block_size == 104, SHA3-384 |
| __ tbz(block_size, 7, sha3_512_or_sha3_384); |
| |
| __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); |
| __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); |
| __ eor(v7, __ T8B, v7, v25); |
| __ eor(v8, __ T8B, v8, v26); |
| __ eor(v9, __ T8B, v9, v27); |
| __ eor(v10, __ T8B, v10, v28); |
| __ eor(v11, __ T8B, v11, v29); |
| __ eor(v12, __ T8B, v12, v30); |
| __ eor(v13, __ T8B, v13, v31); |
| |
| __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); |
| __ eor(v14, __ T8B, v14, v25); |
| __ eor(v15, __ T8B, v15, v26); |
| __ eor(v16, __ T8B, v16, v27); |
| |
| // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 |
| __ andw(c_rarg5, block_size, 48); |
| __ cbzw(c_rarg5, rounds24_loop); |
| |
| __ tbnz(block_size, 5, shake128); |
    // block_size == 144, bit5 == 0, SHA3-224
| __ ldrd(v28, __ post(buf, 8)); |
| __ eor(v17, __ T8B, v17, v28); |
| __ b(rounds24_loop); |
| |
| __ BIND(shake128); |
| __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); |
| __ eor(v17, __ T8B, v17, v28); |
| __ eor(v18, __ T8B, v18, v29); |
| __ eor(v19, __ T8B, v19, v30); |
| __ eor(v20, __ T8B, v20, v31); |
| __ b(rounds24_loop); // block_size == 168, SHAKE128 |
| |
| __ BIND(sha3_512_or_sha3_384); |
| __ ld1(v25, v26, __ T8B, __ post(buf, 16)); |
| __ eor(v7, __ T8B, v7, v25); |
| __ eor(v8, __ T8B, v8, v26); |
| __ tbz(block_size, 5, rounds24_loop); // SHA3-512 |
| |
| // SHA3-384 |
| __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); |
| __ eor(v9, __ T8B, v9, v27); |
| __ eor(v10, __ T8B, v10, v28); |
| __ eor(v11, __ T8B, v11, v29); |
| __ eor(v12, __ T8B, v12, v30); |
| |
| __ BIND(rounds24_loop); |
| __ subw(rscratch2, rscratch2, 1); |
| |
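    // One round of Keccak-f[1600]: theta (eor3 column parities and rax1),
    // rho and pi (the xar rotate-xors, which also permute the lanes), chi
    // (bcax) and iota (the final eor of v0 with the round constant loaded
    // by ld1r).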
| __ eor3(v29, __ T16B, v4, v9, v14); |
| __ eor3(v26, __ T16B, v1, v6, v11); |
| __ eor3(v28, __ T16B, v3, v8, v13); |
| __ eor3(v25, __ T16B, v0, v5, v10); |
| __ eor3(v27, __ T16B, v2, v7, v12); |
| __ eor3(v29, __ T16B, v29, v19, v24); |
| __ eor3(v26, __ T16B, v26, v16, v21); |
| __ eor3(v28, __ T16B, v28, v18, v23); |
| __ eor3(v25, __ T16B, v25, v15, v20); |
| __ eor3(v27, __ T16B, v27, v17, v22); |
| |
| __ rax1(v30, __ T2D, v29, v26); |
| __ rax1(v26, __ T2D, v26, v28); |
| __ rax1(v28, __ T2D, v28, v25); |
| __ rax1(v25, __ T2D, v25, v27); |
| __ rax1(v27, __ T2D, v27, v29); |
| |
| __ eor(v0, __ T16B, v0, v30); |
| __ xar(v29, __ T2D, v1, v25, (64 - 1)); |
| __ xar(v1, __ T2D, v6, v25, (64 - 44)); |
| __ xar(v6, __ T2D, v9, v28, (64 - 20)); |
| __ xar(v9, __ T2D, v22, v26, (64 - 61)); |
| __ xar(v22, __ T2D, v14, v28, (64 - 39)); |
| __ xar(v14, __ T2D, v20, v30, (64 - 18)); |
| __ xar(v31, __ T2D, v2, v26, (64 - 62)); |
| __ xar(v2, __ T2D, v12, v26, (64 - 43)); |
| __ xar(v12, __ T2D, v13, v27, (64 - 25)); |
| __ xar(v13, __ T2D, v19, v28, (64 - 8)); |
| __ xar(v19, __ T2D, v23, v27, (64 - 56)); |
| __ xar(v23, __ T2D, v15, v30, (64 - 41)); |
| __ xar(v15, __ T2D, v4, v28, (64 - 27)); |
| __ xar(v28, __ T2D, v24, v28, (64 - 14)); |
| __ xar(v24, __ T2D, v21, v25, (64 - 2)); |
| __ xar(v8, __ T2D, v8, v27, (64 - 55)); |
| __ xar(v4, __ T2D, v16, v25, (64 - 45)); |
| __ xar(v16, __ T2D, v5, v30, (64 - 36)); |
| __ xar(v5, __ T2D, v3, v27, (64 - 28)); |
| __ xar(v27, __ T2D, v18, v27, (64 - 21)); |
| __ xar(v3, __ T2D, v17, v26, (64 - 15)); |
| __ xar(v25, __ T2D, v11, v25, (64 - 10)); |
| __ xar(v26, __ T2D, v7, v26, (64 - 6)); |
| __ xar(v30, __ T2D, v10, v30, (64 - 3)); |
| |
| __ bcax(v20, __ T16B, v31, v22, v8); |
| __ bcax(v21, __ T16B, v8, v23, v22); |
| __ bcax(v22, __ T16B, v22, v24, v23); |
| __ bcax(v23, __ T16B, v23, v31, v24); |
| __ bcax(v24, __ T16B, v24, v8, v31); |
| |
| __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); |
| |
| __ bcax(v17, __ T16B, v25, v19, v3); |
| __ bcax(v18, __ T16B, v3, v15, v19); |
| __ bcax(v19, __ T16B, v19, v16, v15); |
| __ bcax(v15, __ T16B, v15, v25, v16); |
| __ bcax(v16, __ T16B, v16, v3, v25); |
| |
| __ bcax(v10, __ T16B, v29, v12, v26); |
| __ bcax(v11, __ T16B, v26, v13, v12); |
| __ bcax(v12, __ T16B, v12, v14, v13); |
| __ bcax(v13, __ T16B, v13, v29, v14); |
| __ bcax(v14, __ T16B, v14, v26, v29); |
| |
| __ bcax(v7, __ T16B, v30, v9, v4); |
| __ bcax(v8, __ T16B, v4, v5, v9); |
| __ bcax(v9, __ T16B, v9, v6, v5); |
| __ bcax(v5, __ T16B, v5, v30, v6); |
| __ bcax(v6, __ T16B, v6, v4, v30); |
| |
| __ bcax(v3, __ T16B, v27, v0, v28); |
| __ bcax(v4, __ T16B, v28, v1, v0); |
| __ bcax(v0, __ T16B, v0, v2, v1); |
| __ bcax(v1, __ T16B, v1, v27, v2); |
| __ bcax(v2, __ T16B, v2, v28, v27); |
| |
| __ eor(v0, __ T16B, v0, v31); |
| |
| __ cbnzw(rscratch2, rounds24_loop); |
| |
| if (multi_block) { |
| __ add(ofs, ofs, block_size); |
| __ cmp(ofs, limit); |
| __ br(Assembler::LE, sha3_loop); |
| __ mov(c_rarg0, ofs); // return ofs |
| } |
| |
| __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); |
| __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); |
| __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); |
| __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); |
| __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); |
| __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); |
| __ st1(v24, __ T1D, state); |
| |
| __ ldpd(v14, v15, Address(sp, 48)); |
| __ ldpd(v12, v13, Address(sp, 32)); |
| __ ldpd(v10, v11, Address(sp, 16)); |
| __ ldpd(v8, v9, __ post(sp, 64)); |
| |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| /** |
| * Arguments: |
| * |
| * Inputs: |
| * c_rarg0 - int crc |
| * c_rarg1 - byte* buf |
| * c_rarg2 - int length |
| * |
| * Output: |
   *       r0   - int crc result
| */ |
| address generate_updateBytesCRC32() { |
| assert(UseCRC32Intrinsics, "what are we doing here?"); |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); |
| |
| address start = __ pc(); |
| |
| const Register crc = c_rarg0; // crc |
| const Register buf = c_rarg1; // source java byte array address |
| const Register len = c_rarg2; // length |
| const Register table0 = c_rarg3; // crc_table address |
| const Register table1 = c_rarg4; |
| const Register table2 = c_rarg5; |
| const Register table3 = c_rarg6; |
| const Register tmp3 = c_rarg7; |
| |
| BLOCK_COMMENT("Entry:"); |
| __ enter(); // required for proper stackwalking of RuntimeStub frame |
| |
| __ kernel_crc32(crc, buf, len, |
| table0, table1, table2, table3, rscratch1, rscratch2, tmp3); |
| |
| __ leave(); // required for proper stackwalking of RuntimeStub frame |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // ChaCha20 block function. This version parallelizes by loading |
| // individual 32-bit state elements into vectors for four blocks |
| // (e.g. all four blocks' worth of state[0] in one register, etc.) |
| // |
| // state (int[16]) = c_rarg0 |
  // keystream (byte[256]) = c_rarg1
| // return - number of bytes of keystream (always 256) |
| address generate_chacha20Block_blockpar() { |
| Label L_twoRounds, L_cc20_const; |
| // The constant data is broken into two 128-bit segments to be loaded |
| // onto FloatRegisters. The first 128 bits are a counter add overlay |
| // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. |
    // The second 128 bits are a table constant used for 8-bit left rotations.
| __ BIND(L_cc20_const); |
| __ emit_int64(0x0000000100000000UL); |
| __ emit_int64(0x0000000300000002UL); |
| __ emit_int64(0x0605040702010003UL); |
| __ emit_int64(0x0E0D0C0F0A09080BUL); |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "chacha20Block"); |
| address start = __ pc(); |
| __ enter(); |
| |
| int i, j; |
| const Register state = c_rarg0; |
| const Register keystream = c_rarg1; |
| const Register loopCtr = r10; |
| const Register tmpAddr = r11; |
| |
| const FloatRegister stateFirst = v0; |
| const FloatRegister stateSecond = v1; |
| const FloatRegister stateThird = v2; |
| const FloatRegister stateFourth = v3; |
| const FloatRegister origCtrState = v28; |
| const FloatRegister scratch = v29; |
| const FloatRegister lrot8Tbl = v30; |
| |
| // Organize SIMD registers in an array that facilitates |
| // putting repetitive opcodes into loop structures. It is |
| // important that each grouping of 4 registers is monotonically |
| // increasing to support the requirements of multi-register |
| // instructions (e.g. ld4r, st4, etc.) |
| const FloatRegister workSt[16] = { |
| v4, v5, v6, v7, v16, v17, v18, v19, |
| v20, v21, v22, v23, v24, v25, v26, v27 |
| }; |
| |
    // Load from memory and interlace across 16 SIMD registers, with
    // each word from memory broadcast to all lanes of each successive
    // SIMD register.
    //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes in workSt[i + 1], etc.
| __ mov(tmpAddr, state); |
| for (i = 0; i < 16; i += 4) { |
| __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, |
| __ post(tmpAddr, 16)); |
| } |
| |
| // Pull in constant data. The first 16 bytes are the add overlay |
| // which is applied to the vector holding the counter (state[12]). |
| // The second 16 bytes is the index register for the 8-bit left |
| // rotation tbl instruction. |
| __ adr(tmpAddr, L_cc20_const); |
| __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); |
| __ addv(workSt[12], __ T4S, workSt[12], origCtrState); |
| |
| // Set up the 10 iteration loop and perform all 8 quarter round ops |
| __ mov(loopCtr, 10); |
| __ BIND(L_twoRounds); |
| |
| __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], |
| scratch, lrot8Tbl); |
| __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], |
| scratch, lrot8Tbl); |
| __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], |
| scratch, lrot8Tbl); |
| __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], |
| scratch, lrot8Tbl); |
| |
| __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], |
| scratch, lrot8Tbl); |
| __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], |
| scratch, lrot8Tbl); |
| __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], |
| scratch, lrot8Tbl); |
| __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], |
| scratch, lrot8Tbl); |
| |
| // Decrement and iterate |
| __ sub(loopCtr, loopCtr, 1); |
| __ cbnz(loopCtr, L_twoRounds); |
| |
| __ mov(tmpAddr, state); |
| |
| // Add the starting state back to the post-loop keystream |
| // state. We read/interlace the state array from memory into |
| // 4 registers similar to what we did in the beginning. Then |
| // add the counter overlay onto workSt[12] at the end. |
| for (i = 0; i < 16; i += 4) { |
| __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, |
| __ post(tmpAddr, 16)); |
| __ addv(workSt[i], __ T4S, workSt[i], stateFirst); |
| __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); |
| __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); |
| __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); |
| } |
| __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask |
| |
| // Write to key stream, storing the same element out of workSt[0..15] |
| // to consecutive 4-byte offsets in the key stream buffer, then repeating |
| // for the next element position. |
| for (i = 0; i < 4; i++) { |
| for (j = 0; j < 16; j += 4) { |
| __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, |
| __ post(keystream, 16)); |
| } |
| } |
| |
| __ mov(r0, 256); // Return length of output keystream |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| /** |
| * Arguments: |
| * |
| * Inputs: |
| * c_rarg0 - int crc |
| * c_rarg1 - byte* buf |
| * c_rarg2 - int length |
| * c_rarg3 - int* table |
| * |
| * Output: |
| * r0 - int crc result |
| */ |
| address generate_updateBytesCRC32C() { |
| assert(UseCRC32CIntrinsics, "what are we doing here?"); |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); |
| |
| address start = __ pc(); |
| |
| const Register crc = c_rarg0; // crc |
| const Register buf = c_rarg1; // source java byte array address |
| const Register len = c_rarg2; // length |
| const Register table0 = c_rarg3; // crc_table address |
| const Register table1 = c_rarg4; |
| const Register table2 = c_rarg5; |
| const Register table3 = c_rarg6; |
| const Register tmp3 = c_rarg7; |
| |
| BLOCK_COMMENT("Entry:"); |
| __ enter(); // required for proper stackwalking of RuntimeStub frame |
| |
| __ kernel_crc32c(crc, buf, len, |
| table0, table1, table2, table3, rscratch1, rscratch2, tmp3); |
| |
| __ leave(); // required for proper stackwalking of RuntimeStub frame |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| /*** |
| * Arguments: |
| * |
| * Inputs: |
| * c_rarg0 - int adler |
| * c_rarg1 - byte* buff |
| * c_rarg2 - int len |
| * |
| * Output: |
| * c_rarg0 - int adler result |
| */ |
| address generate_updateBytesAdler32() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); |
| address start = __ pc(); |
| |
| Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; |
| |
| // Aliases |
| Register adler = c_rarg0; |
| Register s1 = c_rarg0; |
| Register s2 = c_rarg3; |
| Register buff = c_rarg1; |
| Register len = c_rarg2; |
| Register nmax = r4; |
| Register base = r5; |
| Register count = r6; |
| Register temp0 = rscratch1; |
| Register temp1 = rscratch2; |
| FloatRegister vbytes = v0; |
| FloatRegister vs1acc = v1; |
| FloatRegister vs2acc = v2; |
| FloatRegister vtable = v3; |
| |
| // Max number of bytes we can process before having to take the mod |
| // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 |
| uint64_t BASE = 0xfff1; |
| uint64_t NMAX = 0x15B0; |
| |
| __ mov(base, BASE); |
| __ mov(nmax, NMAX); |
| |
| // Load accumulation coefficients for the upper 16 bits |
| __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); |
| __ ld1(vtable, __ T16B, Address(temp0)); |
| |
| // s1 is initialized to the lower 16 bits of adler |
| // s2 is initialized to the upper 16 bits of adler |
| __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) |
| __ uxth(s1, adler); // s1 = (adler & 0xffff) |
| |
    // The pipelined loop needs at least 16 elements per iteration.
    // The loop checks this itself, but it is cheaper to skip straight to
    // the cleanup loop for shorter inputs.
| __ cmp(len, (u1)16); |
| __ br(Assembler::HS, L_nmax); |
| __ cbz(len, L_combine); |
| |
| __ bind(L_simple_by1_loop); |
| __ ldrb(temp0, Address(__ post(buff, 1))); |
| __ add(s1, s1, temp0); |
| __ add(s2, s2, s1); |
| __ subs(len, len, 1); |
| __ br(Assembler::HI, L_simple_by1_loop); |
| |
| // s1 = s1 % BASE |
| __ subs(temp0, s1, base); |
| __ csel(s1, temp0, s1, Assembler::HS); |
| |
| // s2 = s2 % BASE |
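    // Since 2^16 mod BASE == 15, x mod BASE can be folded as
    // (x & 0xffff) + 15 * (x >> 16); below, temp1 = 16 * (x >> 16) - (x >> 16).
    // One fold here (two for the larger accumulators later) leaves the value
    // under 2 * BASE, and the conditional subtract completes the reduction.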
| __ lsr(temp0, s2, 16); |
| __ lsl(temp1, temp0, 4); |
| __ sub(temp1, temp1, temp0); |
| __ add(s2, temp1, s2, ext::uxth); |
| |
| __ subs(temp0, s2, base); |
| __ csel(s2, temp0, s2, Assembler::HS); |
| |
| __ b(L_combine); |
| |
| __ bind(L_nmax); |
| __ subs(len, len, nmax); |
| __ sub(count, nmax, 16); |
| __ br(Assembler::LO, L_by16); |
| |
| __ bind(L_nmax_loop); |
| |
| generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, |
| vbytes, vs1acc, vs2acc, vtable); |
| |
| __ subs(count, count, 16); |
| __ br(Assembler::HS, L_nmax_loop); |
| |
| // s1 = s1 % BASE |
| __ lsr(temp0, s1, 16); |
| __ lsl(temp1, temp0, 4); |
| __ sub(temp1, temp1, temp0); |
| __ add(temp1, temp1, s1, ext::uxth); |
| |
| __ lsr(temp0, temp1, 16); |
| __ lsl(s1, temp0, 4); |
| __ sub(s1, s1, temp0); |
    __ add(s1, s1, temp1, ext::uxth);
| |
| __ subs(temp0, s1, base); |
| __ csel(s1, temp0, s1, Assembler::HS); |
| |
| // s2 = s2 % BASE |
| __ lsr(temp0, s2, 16); |
| __ lsl(temp1, temp0, 4); |
| __ sub(temp1, temp1, temp0); |
| __ add(temp1, temp1, s2, ext::uxth); |
| |
| __ lsr(temp0, temp1, 16); |
| __ lsl(s2, temp0, 4); |
| __ sub(s2, s2, temp0); |
    __ add(s2, s2, temp1, ext::uxth);
| |
| __ subs(temp0, s2, base); |
| __ csel(s2, temp0, s2, Assembler::HS); |
| |
| __ subs(len, len, nmax); |
| __ sub(count, nmax, 16); |
| __ br(Assembler::HS, L_nmax_loop); |
| |
| __ bind(L_by16); |
| __ adds(len, len, count); |
| __ br(Assembler::LO, L_by1); |
| |
| __ bind(L_by16_loop); |
| |
| generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, |
| vbytes, vs1acc, vs2acc, vtable); |
| |
| __ subs(len, len, 16); |
| __ br(Assembler::HS, L_by16_loop); |
| |
| __ bind(L_by1); |
| __ adds(len, len, 15); |
| __ br(Assembler::LO, L_do_mod); |
| |
| __ bind(L_by1_loop); |
| __ ldrb(temp0, Address(__ post(buff, 1))); |
| __ add(s1, temp0, s1); |
| __ add(s2, s2, s1); |
| __ subs(len, len, 1); |
| __ br(Assembler::HS, L_by1_loop); |
| |
| __ bind(L_do_mod); |
| // s1 = s1 % BASE |
| __ lsr(temp0, s1, 16); |
| __ lsl(temp1, temp0, 4); |
| __ sub(temp1, temp1, temp0); |
| __ add(temp1, temp1, s1, ext::uxth); |
| |
| __ lsr(temp0, temp1, 16); |
| __ lsl(s1, temp0, 4); |
| __ sub(s1, s1, temp0); |
    __ add(s1, s1, temp1, ext::uxth);
| |
| __ subs(temp0, s1, base); |
| __ csel(s1, temp0, s1, Assembler::HS); |
| |
| // s2 = s2 % BASE |
| __ lsr(temp0, s2, 16); |
| __ lsl(temp1, temp0, 4); |
| __ sub(temp1, temp1, temp0); |
| __ add(temp1, temp1, s2, ext::uxth); |
| |
| __ lsr(temp0, temp1, 16); |
| __ lsl(s2, temp0, 4); |
| __ sub(s2, s2, temp0); |
    __ add(s2, s2, temp1, ext::uxth);
| |
| __ subs(temp0, s2, base); |
| __ csel(s2, temp0, s2, Assembler::HS); |
| |
| // Combine lower bits and higher bits |
| __ bind(L_combine); |
| __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) |
| |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, |
| Register temp0, Register temp1, FloatRegister vbytes, |
| FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { |
| // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. |
| // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. |
| // In non-vectorized code, we update s1 and s2 as: |
| // s1 <- s1 + b1 |
| // s2 <- s2 + s1 |
| // s1 <- s1 + b2 |
    //   s2 <- s2 + s1
| // ... |
| // s1 <- s1 + b16 |
| // s2 <- s2 + s1 |
| // Putting above assignments together, we have: |
| // s1_new = s1 + b1 + b2 + ... + b16 |
| // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) |
| // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) |
| // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) |
| __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); |
| |
| // s2 = s2 + s1 * 16 |
| __ add(s2, s2, s1, Assembler::LSL, 4); |
| |
| // vs1acc = b1 + b2 + b3 + ... + b16 |
| // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) |
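    // vtable holds the weights (16, 15, ..., 1) of the dot product above:
    // umullv/umlalv form the 16 byte-by-weight products across eight 16-bit
    // lanes (low halves, then high halves), and the uaddlv instructions
    // horizontally sum the byte lanes and the product lanes.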
| __ umullv(vs2acc, __ T8B, vtable, vbytes); |
| __ umlalv(vs2acc, __ T16B, vtable, vbytes); |
| __ uaddlv(vs1acc, __ T16B, vbytes); |
| __ uaddlv(vs2acc, __ T8H, vs2acc); |
| |
| // s1 = s1 + vs1acc, s2 = s2 + vs2acc |
| __ fmovd(temp0, vs1acc); |
| __ fmovd(temp1, vs2acc); |
| __ add(s1, s1, temp0); |
| __ add(s2, s2, temp1); |
| } |
| |
| /** |
| * Arguments: |
| * |
| * Input: |
| * c_rarg0 - x address |
| * c_rarg1 - x length |
| * c_rarg2 - y address |
| * c_rarg3 - y length |
| * c_rarg4 - z address |
| * c_rarg5 - z length |
| */ |
| address generate_multiplyToLen() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); |
| |
| address start = __ pc(); |
| const Register x = r0; |
| const Register xlen = r1; |
| const Register y = r2; |
| const Register ylen = r3; |
| const Register z = r4; |
| const Register zlen = r5; |
| |
| const Register tmp1 = r10; |
| const Register tmp2 = r11; |
| const Register tmp3 = r12; |
| const Register tmp4 = r13; |
| const Register tmp5 = r14; |
| const Register tmp6 = r15; |
| const Register tmp7 = r16; |
| |
| BLOCK_COMMENT("Entry:"); |
| __ enter(); // required for proper stackwalking of RuntimeStub frame |
| __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); |
| __ leave(); // required for proper stackwalking of RuntimeStub frame |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| address generate_squareToLen() { |
    // The squareToLen algorithm for sizes 1..127, described in Java code,
    // works faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len shows slightly better overall results.
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "squareToLen"); |
| address start = __ pc(); |
| |
| const Register x = r0; |
| const Register xlen = r1; |
| const Register z = r2; |
| const Register zlen = r3; |
| const Register y = r4; // == x |
| const Register ylen = r5; // == xlen |
| |
| const Register tmp1 = r10; |
| const Register tmp2 = r11; |
| const Register tmp3 = r12; |
| const Register tmp4 = r13; |
| const Register tmp5 = r14; |
| const Register tmp6 = r15; |
| const Register tmp7 = r16; |
| |
| RegSet spilled_regs = RegSet::of(y, ylen); |
| BLOCK_COMMENT("Entry:"); |
| __ enter(); |
| __ push(spilled_regs, sp); |
| __ mov(y, x); |
| __ mov(ylen, xlen); |
| __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); |
| __ pop(spilled_regs, sp); |
| __ leave(); |
| __ ret(lr); |
| return start; |
| } |
| |
| address generate_mulAdd() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "mulAdd"); |
| |
| address start = __ pc(); |
| |
| const Register out = r0; |
| const Register in = r1; |
| const Register offset = r2; |
| const Register len = r3; |
| const Register k = r4; |
| |
| BLOCK_COMMENT("Entry:"); |
| __ enter(); |
| __ mul_add(out, in, offset, len, k); |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Arguments: |
| // |
| // Input: |
| // c_rarg0 - newArr address |
| // c_rarg1 - oldArr address |
| // c_rarg2 - newIdx |
| // c_rarg3 - shiftCount |
| // c_rarg4 - numIter |
| // |
| address generate_bigIntegerRightShift() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); |
| address start = __ pc(); |
| |
| Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; |
| |
| Register newArr = c_rarg0; |
| Register oldArr = c_rarg1; |
| Register newIdx = c_rarg2; |
| Register shiftCount = c_rarg3; |
| Register numIter = c_rarg4; |
| Register idx = numIter; |
| |
| Register newArrCur = rscratch1; |
| Register shiftRevCount = rscratch2; |
| Register oldArrCur = r13; |
| Register oldArrNext = r14; |
| |
| FloatRegister oldElem0 = v0; |
| FloatRegister oldElem1 = v1; |
| FloatRegister newElem = v2; |
| FloatRegister shiftVCount = v3; |
| FloatRegister shiftVRevCount = v4; |
| |
| __ cbz(idx, Exit); |
| |
| __ add(newArr, newArr, newIdx, Assembler::LSL, 2); |
| |
| // left shift count |
| __ movw(shiftRevCount, 32); |
| __ subw(shiftRevCount, shiftRevCount, shiftCount); |
| |
    // numIter is too small for a 4-word SIMD loop; fall back to scalar code
| __ cmp(numIter, (u1)4); |
| __ br(Assembler::LT, ShiftThree); |
| |
| __ dup(shiftVCount, __ T4S, shiftCount); |
| __ dup(shiftVRevCount, __ T4S, shiftRevCount); |
| __ negr(shiftVCount, __ T4S, shiftVCount); |
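    // Each output word combines two adjacent input words:
    //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
    // NEON has no variable right-shift, so the right shift is a ushl with
    // the negated shift count from above.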
| |
| __ BIND(ShiftSIMDLoop); |
| |
| // Calculate the load addresses |
| __ sub(idx, idx, 4); |
| __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); |
| __ add(newArrCur, newArr, idx, Assembler::LSL, 2); |
| __ add(oldArrCur, oldArrNext, 4); |
| |
| // Load 4 words and process |
| __ ld1(oldElem0, __ T4S, Address(oldArrCur)); |
| __ ld1(oldElem1, __ T4S, Address(oldArrNext)); |
| __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); |
| __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); |
| __ orr(newElem, __ T16B, oldElem0, oldElem1); |
| __ st1(newElem, __ T4S, Address(newArrCur)); |
| |
| __ cmp(idx, (u1)4); |
| __ br(Assembler::LT, ShiftTwoLoop); |
| __ b(ShiftSIMDLoop); |
| |
| __ BIND(ShiftTwoLoop); |
| __ cbz(idx, Exit); |
| __ cmp(idx, (u1)1); |
| __ br(Assembler::EQ, ShiftOne); |
| |
| // Calculate the load addresses |
| __ sub(idx, idx, 2); |
| __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); |
| __ add(newArrCur, newArr, idx, Assembler::LSL, 2); |
| __ add(oldArrCur, oldArrNext, 4); |
| |
| // Load 2 words and process |
| __ ld1(oldElem0, __ T2S, Address(oldArrCur)); |
| __ ld1(oldElem1, __ T2S, Address(oldArrNext)); |
| __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); |
| __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); |
| __ orr(newElem, __ T8B, oldElem0, oldElem1); |
| __ st1(newElem, __ T2S, Address(newArrCur)); |
| __ b(ShiftTwoLoop); |
| |
| __ BIND(ShiftThree); |
| __ tbz(idx, 1, ShiftOne); |
| __ tbz(idx, 0, ShiftTwo); |
| __ ldrw(r10, Address(oldArr, 12)); |
| __ ldrw(r11, Address(oldArr, 8)); |
| __ lsrvw(r10, r10, shiftCount); |
| __ lslvw(r11, r11, shiftRevCount); |
| __ orrw(r12, r10, r11); |
| __ strw(r12, Address(newArr, 8)); |
| |
| __ BIND(ShiftTwo); |
| __ ldrw(r10, Address(oldArr, 8)); |
| __ ldrw(r11, Address(oldArr, 4)); |
| __ lsrvw(r10, r10, shiftCount); |
| __ lslvw(r11, r11, shiftRevCount); |
| __ orrw(r12, r10, r11); |
| __ strw(r12, Address(newArr, 4)); |
| |
| __ BIND(ShiftOne); |
| __ ldrw(r10, Address(oldArr, 4)); |
| __ ldrw(r11, Address(oldArr)); |
| __ lsrvw(r10, r10, shiftCount); |
| __ lslvw(r11, r11, shiftRevCount); |
| __ orrw(r12, r10, r11); |
| __ strw(r12, Address(newArr)); |
| |
| __ BIND(Exit); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Arguments: |
| // |
| // Input: |
| // c_rarg0 - newArr address |
| // c_rarg1 - oldArr address |
| // c_rarg2 - newIdx |
| // c_rarg3 - shiftCount |
| // c_rarg4 - numIter |
| // |
| address generate_bigIntegerLeftShift() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); |
| address start = __ pc(); |
| |
| Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; |
| |
| Register newArr = c_rarg0; |
| Register oldArr = c_rarg1; |
| Register newIdx = c_rarg2; |
| Register shiftCount = c_rarg3; |
| Register numIter = c_rarg4; |
| |
| Register shiftRevCount = rscratch1; |
| Register oldArrNext = rscratch2; |
| |
| FloatRegister oldElem0 = v0; |
| FloatRegister oldElem1 = v1; |
| FloatRegister newElem = v2; |
| FloatRegister shiftVCount = v3; |
| FloatRegister shiftVRevCount = v4; |
| |
| __ cbz(numIter, Exit); |
| |
| __ add(oldArrNext, oldArr, 4); |
| __ add(newArr, newArr, newIdx, Assembler::LSL, 2); |
| |
| // right shift count |
| __ movw(shiftRevCount, 32); |
| __ subw(shiftRevCount, shiftRevCount, shiftCount); |
| |
    // numIter is too small for a 4-word SIMD loop; fall back to scalar code
| __ cmp(numIter, (u1)4); |
| __ br(Assembler::LT, ShiftThree); |
| |
| __ dup(shiftVCount, __ T4S, shiftCount); |
| __ dup(shiftVRevCount, __ T4S, shiftRevCount); |
| __ negr(shiftVRevCount, __ T4S, shiftVRevCount); |
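    // Mirror of the right-shift worker: each output word is
    //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
    // with the right shift again done via ushl, here with the negated
    // (32 - shiftCount) operand.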
| |
| __ BIND(ShiftSIMDLoop); |
| |
| // load 4 words and process |
| __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); |
| __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); |
| __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); |
| __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); |
| __ orr(newElem, __ T16B, oldElem0, oldElem1); |
| __ st1(newElem, __ T4S, __ post(newArr, 16)); |
| __ sub(numIter, numIter, 4); |
| |
| __ cmp(numIter, (u1)4); |
| __ br(Assembler::LT, ShiftTwoLoop); |
| __ b(ShiftSIMDLoop); |
| |
| __ BIND(ShiftTwoLoop); |
| __ cbz(numIter, Exit); |
| __ cmp(numIter, (u1)1); |
| __ br(Assembler::EQ, ShiftOne); |
| |
| // load 2 words and process |
| __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); |
| __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); |
| __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); |
| __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); |
| __ orr(newElem, __ T8B, oldElem0, oldElem1); |
| __ st1(newElem, __ T2S, __ post(newArr, 8)); |
| __ sub(numIter, numIter, 2); |
| __ b(ShiftTwoLoop); |
| |
| __ BIND(ShiftThree); |
| __ ldrw(r10, __ post(oldArr, 4)); |
| __ ldrw(r11, __ post(oldArrNext, 4)); |
| __ lslvw(r10, r10, shiftCount); |
| __ lsrvw(r11, r11, shiftRevCount); |
| __ orrw(r12, r10, r11); |
| __ strw(r12, __ post(newArr, 4)); |
| __ tbz(numIter, 1, Exit); |
| __ tbz(numIter, 0, ShiftOne); |
| |
| __ BIND(ShiftTwo); |
| __ ldrw(r10, __ post(oldArr, 4)); |
| __ ldrw(r11, __ post(oldArrNext, 4)); |
| __ lslvw(r10, r10, shiftCount); |
| __ lsrvw(r11, r11, shiftRevCount); |
| __ orrw(r12, r10, r11); |
| __ strw(r12, __ post(newArr, 4)); |
| |
| __ BIND(ShiftOne); |
| __ ldrw(r10, Address(oldArr)); |
| __ ldrw(r11, Address(oldArrNext)); |
| __ lslvw(r10, r10, shiftCount); |
| __ lsrvw(r11, r11, shiftRevCount); |
| __ orrw(r12, r10, r11); |
| __ strw(r12, Address(newArr)); |
| |
| __ BIND(Exit); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| address generate_count_positives(address &count_positives_long) { |
| const u1 large_loop_size = 64; |
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
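    // A byte is negative iff its top bit is set, so testing a 64-bit word
    // against UPPER_BIT_MASK (0x80 in every byte lane) detects whether any
    // of its eight bytes is negative.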
| int dcache_line = VM_Version::dcache_line_size(); |
| |
| Register ary1 = r1, len = r2, result = r0; |
| |
| __ align(CodeEntryAlignment); |
| |
| StubCodeMark mark(this, "StubRoutines", "count_positives"); |
| |
| address entry = __ pc(); |
| |
| __ enter(); |
| // precondition: a copy of len is already in result |
| // __ mov(result, len); |
| |
| Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, |
| LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; |
| |
| __ cmp(len, (u1)15); |
| __ br(Assembler::GT, LEN_OVER_15); |
    // The only case in which execution falls into this code is when the
    // pointer is near the end of a memory page and we have to avoid reading
    // the next page.
| __ add(ary1, ary1, len); |
| __ subs(len, len, 8); |
| __ br(Assembler::GT, LEN_OVER_8); |
| __ ldr(rscratch2, Address(ary1, -8)); |
| __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. |
| __ lsrv(rscratch2, rscratch2, rscratch1); |
| __ tst(rscratch2, UPPER_BIT_MASK); |
| __ csel(result, zr, result, Assembler::NE); |
| __ leave(); |
| __ ret(lr); |
| __ bind(LEN_OVER_8); |
| __ ldp(rscratch1, rscratch2, Address(ary1, -16)); |
| __ sub(len, len, 8); // no data dep., then sub can be executed while loading |
| __ tst(rscratch2, UPPER_BIT_MASK); |
| __ br(Assembler::NE, RET_NO_POP); |
| __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes |
| __ lsrv(rscratch1, rscratch1, rscratch2); |
| __ tst(rscratch1, UPPER_BIT_MASK); |
| __ bind(RET_NO_POP); |
| __ csel(result, zr, result, Assembler::NE); |
| __ leave(); |
| __ ret(lr); |
| |
| Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; |
| const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; |
| |
| count_positives_long = __ pc(); // 2nd entry point |
| |
| __ enter(); |
| |
| __ bind(LEN_OVER_15); |
| __ push(spilled_regs, sp); |
| __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment |
| __ cbz(rscratch2, ALIGNED); |
| __ ldp(tmp6, tmp1, Address(ary1)); |
| __ mov(tmp5, 16); |
| __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address |
| __ add(ary1, ary1, rscratch1); |
| __ orr(tmp6, tmp6, tmp1); |
| __ tst(tmp6, UPPER_BIT_MASK); |
| __ br(Assembler::NE, RET_ADJUST); |
| __ sub(len, len, rscratch1); |
| |
| __ bind(ALIGNED); |
| __ cmp(len, large_loop_size); |
| __ br(Assembler::LT, CHECK_16); |
| // Perform a 16-byte load as an early-return pre-loop to handle the case |
| // where an initially aligned large array has negative values in its |
| // starting bytes: LARGE_LOOP would then do 4 reads instead of 1 (in the |
| // worst case), which is slower. Cases with negative bytes further ahead |
| // are barely affected; in fact they get faster due to the early loads and |
| // the fewer instructions and branches in LARGE_LOOP. |
| __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); |
| __ sub(len, len, 16); |
| __ orr(tmp6, tmp6, tmp1); |
| __ tst(tmp6, UPPER_BIT_MASK); |
| __ br(Assembler::NE, RET_ADJUST_16); |
| __ cmp(len, large_loop_size); |
| __ br(Assembler::LT, CHECK_16); |
| |
| if (SoftwarePrefetchHintDistance >= 0 |
| && SoftwarePrefetchHintDistance >= dcache_line) { |
| // initial prefetch |
| __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); |
| } |
| __ bind(LARGE_LOOP); |
| if (SoftwarePrefetchHintDistance >= 0) { |
| __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); |
| } |
| // Issue the load instructions first, since that can save a few CPU/memory |
| // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)" |
| // (one per ldp), it is better to generate 7 * orr(...) plus a single test |
| // and branch, which saves 3 instructions per iteration and has fewer |
| // branches. The downside is that early return is disabled, so all 64 |
| // bytes are loaded and checked every time. |
| __ ldp(tmp2, tmp3, Address(ary1)); |
| __ ldp(tmp4, tmp5, Address(ary1, 16)); |
| __ ldp(rscratch1, rscratch2, Address(ary1, 32)); |
| __ ldp(tmp6, tmp1, Address(ary1, 48)); |
| __ add(ary1, ary1, large_loop_size); |
| __ sub(len, len, large_loop_size); |
| __ orr(tmp2, tmp2, tmp3); |
| __ orr(tmp4, tmp4, tmp5); |
| __ orr(rscratch1, rscratch1, rscratch2); |
| __ orr(tmp6, tmp6, tmp1); |
| __ orr(tmp2, tmp2, tmp4); |
| __ orr(rscratch1, rscratch1, tmp6); |
| __ orr(tmp2, tmp2, rscratch1); |
| __ tst(tmp2, UPPER_BIT_MASK); |
| __ br(Assembler::NE, RET_ADJUST_LONG); |
| __ cmp(len, large_loop_size); |
| __ br(Assembler::GE, LARGE_LOOP); |
| |
| __ bind(CHECK_16); // small 16-byte load pre-loop |
| __ cmp(len, (u1)16); |
| __ br(Assembler::LT, POST_LOOP16); |
| |
| __ bind(LOOP16); // small 16-byte load loop |
| __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); |
| __ sub(len, len, 16); |
| __ orr(tmp2, tmp2, tmp3); |
| __ tst(tmp2, UPPER_BIT_MASK); |
| __ br(Assembler::NE, RET_ADJUST_16); |
| __ cmp(len, (u1)16); |
| __ br(Assembler::GE, LOOP16); // 16-byte load loop end |
| |
| __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally |
| __ cmp(len, (u1)8); |
| __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); |
| __ ldr(tmp3, Address(__ post(ary1, 8))); |
| __ tst(tmp3, UPPER_BIT_MASK); |
| __ br(Assembler::NE, RET_ADJUST); |
| __ sub(len, len, 8); |
| |
| __ bind(POST_LOOP16_LOAD_TAIL); |
| __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 |
| __ ldr(tmp1, Address(ary1)); |
| __ mov(tmp2, 64); |
| __ sub(tmp4, tmp2, len, __ LSL, 3); |
| __ lslv(tmp1, tmp1, tmp4); |
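| // Shifting left by (8 - len) bytes discards whatever was read from past |
| // the tail; the surviving bytes stay byte-aligned, so the per-byte |
| // sign-bit test below remains valid. |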
| __ tst(tmp1, UPPER_BIT_MASK); |
| __ br(Assembler::NE, RET_ADJUST); |
| // Fallthrough |
| |
| __ bind(RET_LEN); |
| __ pop(spilled_regs, sp); |
| __ leave(); |
| __ ret(lr); |
| |
| // result - len is the count of bytes that are guaranteed to be positive |
| |
| __ bind(RET_ADJUST_LONG); |
| __ add(len, len, (u1)(large_loop_size - 16)); |
| __ bind(RET_ADJUST_16); |
| __ add(len, len, 16); |
| __ bind(RET_ADJUST); |
| __ pop(spilled_regs, sp); |
| __ leave(); |
| __ sub(result, result, len); |
| __ ret(lr); |
| |
| return entry; |
| } |
| |
| void generate_large_array_equals_loop_nonsimd(int loopThreshold, |
| bool usePrefetch, Label &NOT_EQUAL) { |
| Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, |
| tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, |
| tmp7 = r12, tmp8 = r13; |
| Label LOOP; |
| |
| __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); |
| __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); |
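| // The loop is software-pipelined: the two ldp's above pre-load the first |
| // pair, and inside the loop each eor/cbnz checks values loaded on the |
| // previous step while the next loads are already in flight. |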
| __ bind(LOOP); |
| if (usePrefetch) { |
| __ prfm(Address(a1, SoftwarePrefetchHintDistance)); |
| __ prfm(Address(a2, SoftwarePrefetchHintDistance)); |
| } |
| __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); |
| __ eor(tmp1, tmp1, tmp2); |
| __ eor(tmp3, tmp3, tmp4); |
| __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); |
| __ orr(tmp1, tmp1, tmp3); |
| __ cbnz(tmp1, NOT_EQUAL); |
| __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); |
| __ eor(tmp5, tmp5, tmp6); |
| __ eor(tmp7, tmp7, tmp8); |
| __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); |
| __ orr(tmp5, tmp5, tmp7); |
| __ cbnz(tmp5, NOT_EQUAL); |
| __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); |
| __ eor(tmp1, tmp1, tmp2); |
| __ eor(tmp3, tmp3, tmp4); |
| __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); |
| __ orr(tmp1, tmp1, tmp3); |
| __ cbnz(tmp1, NOT_EQUAL); |
| __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); |
| __ eor(tmp5, tmp5, tmp6); |
| __ sub(cnt1, cnt1, 8 * wordSize); |
| __ eor(tmp7, tmp7, tmp8); |
| __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); |
| // tmp6 is not used. MacroAssembler::subs is used here (rather than |
| // cmp) because subs allows an unlimited range of immediate operands. |
| __ subs(tmp6, cnt1, loopThreshold); |
| __ orr(tmp5, tmp5, tmp7); |
| __ cbnz(tmp5, NOT_EQUAL); |
| __ br(__ GE, LOOP); |
| // post-loop |
| __ eor(tmp1, tmp1, tmp2); |
| __ eor(tmp3, tmp3, tmp4); |
| __ orr(tmp1, tmp1, tmp3); |
| __ sub(cnt1, cnt1, 2 * wordSize); |
| __ cbnz(tmp1, NOT_EQUAL); |
| } |
| |
| void generate_large_array_equals_loop_simd(int loopThreshold, |
| bool usePrefetch, Label &NOT_EQUAL) { |
| Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, |
| tmp2 = rscratch2; |
| Label LOOP; |
| |
| __ bind(LOOP); |
| if (usePrefetch) { |
| __ prfm(Address(a1, SoftwarePrefetchHintDistance)); |
| __ prfm(Address(a2, SoftwarePrefetchHintDistance)); |
| } |
| __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); |
| __ sub(cnt1, cnt1, 8 * wordSize); |
| __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); |
| __ subs(tmp1, cnt1, loopThreshold); |
| __ eor(v0, __ T16B, v0, v4); |
| __ eor(v1, __ T16B, v1, v5); |
| __ eor(v2, __ T16B, v2, v6); |
| __ eor(v3, __ T16B, v3, v7); |
| __ orr(v0, __ T16B, v0, v1); |
| __ orr(v1, __ T16B, v2, v3); |
| __ orr(v0, __ T16B, v0, v1); |
| __ umov(tmp1, v0, __ D, 0); |
| __ umov(tmp2, v0, __ D, 1); |
| __ orr(tmp1, tmp1, tmp2); |
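| // v0 has accumulated the XOR of all 64 compared bytes; OR-ing its two |
| // 64-bit halves into tmp1 makes tmp1 non-zero iff any byte differed. |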
| __ cbnz(tmp1, NOT_EQUAL); |
| __ br(__ GE, LOOP); |
| } |
| |
| // a1 = r1 - array1 address |
| // a2 = r2 - array2 address |
| // result = r0 - return value. Already contains "false" |
| // cnt1 = r10 - number of elements left to check, reduced by wordSize |
| // r3-r5 are reserved temporary registers |
| // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 |
| address generate_large_array_equals() { |
| Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, |
| tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, |
| tmp7 = r12, tmp8 = r13; |
| Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, |
| SMALL_LOOP, POST_LOOP; |
| const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; |
| // threshold chosen so that at least 32 of the prefetched bytes are used |
| int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; |
| int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); |
| RegSet spilled_regs = RegSet::range(tmp6, tmp8); |
| assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, |
| tmp5, tmp6, tmp7, tmp8); |
| |
| __ align(CodeEntryAlignment); |
| |
| StubCodeMark mark(this, "StubRoutines", "large_array_equals"); |
| |
| address entry = __ pc(); |
| __ enter(); |
| __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub |
| // also advance pointers to use post-increment instead of pre-increment |
| __ add(a1, a1, wordSize); |
| __ add(a2, a2, wordSize); |
| if (AvoidUnalignedAccesses) { |
| // Both implementations (SIMD/non-SIMD) use relatively large load |
| // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution |
| // time) on some CPUs when the address is not at least 16-byte aligned. |
| // Arrays are currently 8-byte aligned, so, if needed, we do one extra |
| // 8-byte load to make at least the first address 16-byte aligned. |
| Label ALIGNED16; |
| __ tbz(a1, 3, ALIGNED16); |
| __ ldr(tmp1, Address(__ post(a1, wordSize))); |
| __ ldr(tmp2, Address(__ post(a2, wordSize))); |
| __ sub(cnt1, cnt1, wordSize); |
| __ eor(tmp1, tmp1, tmp2); |
| __ cbnz(tmp1, NOT_EQUAL_NO_POP); |
| __ bind(ALIGNED16); |
| } |
| if (UseSIMDForArrayEquals) { |
| if (SoftwarePrefetchHintDistance >= 0) { |
| __ subs(tmp1, cnt1, prefetchLoopThreshold); |
| __ br(__ LE, NO_PREFETCH_LARGE_LOOP); |
| generate_large_array_equals_loop_simd(prefetchLoopThreshold, |
| /* prfm = */ true, NOT_EQUAL); |
| __ subs(zr, cnt1, nonPrefetchLoopThreshold); |
| __ br(__ LT, TAIL); |
| } |
| __ bind(NO_PREFETCH_LARGE_LOOP); |
| generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, |
| /* prfm = */ false, NOT_EQUAL); |
| } else { |
| __ push(spilled_regs, sp); |
| if (SoftwarePrefetchHintDistance >= 0) { |
| __ subs(tmp1, cnt1, prefetchLoopThreshold); |
| __ br(__ LE, NO_PREFETCH_LARGE_LOOP); |
| generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, |
| /* prfm = */ true, NOT_EQUAL); |
| __ subs(zr, cnt1, nonPrefetchLoopThreshold); |
| __ br(__ LT, TAIL); |
| } |
| __ bind(NO_PREFETCH_LARGE_LOOP); |
| generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, |
| /* prfm = */ false, NOT_EQUAL); |
| } |
| __ bind(TAIL); |
| __ cbz(cnt1, EQUAL); |
| __ subs(cnt1, cnt1, wordSize); |
| __ br(__ LE, POST_LOOP); |
| __ bind(SMALL_LOOP); |
| __ ldr(tmp1, Address(__ post(a1, wordSize))); |
| __ ldr(tmp2, Address(__ post(a2, wordSize))); |
| __ subs(cnt1, cnt1, wordSize); |
| __ eor(tmp1, tmp1, tmp2); |
| __ cbnz(tmp1, NOT_EQUAL); |
| __ br(__ GT, SMALL_LOOP); |
| __ bind(POST_LOOP); |
| __ ldr(tmp1, Address(a1, cnt1)); |
| __ ldr(tmp2, Address(a2, cnt1)); |
| __ eor(tmp1, tmp1, tmp2); |
| __ cbnz(tmp1, NOT_EQUAL); |
| __ bind(EQUAL); |
| __ mov(result, true); |
| __ bind(NOT_EQUAL); |
| if (!UseSIMDForArrayEquals) { |
| __ pop(spilled_regs, sp); |
| } |
| __ bind(NOT_EQUAL_NO_POP); |
| __ leave(); |
| __ ret(lr); |
| return entry; |
| } |
| |
| address generate_dsin_dcos(bool isCos) { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin"); |
| address start = __ pc(); |
| __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, |
| (address)StubRoutines::aarch64::_two_over_pi, |
| (address)StubRoutines::aarch64::_pio2, |
| (address)StubRoutines::aarch64::_dsin_coef, |
| (address)StubRoutines::aarch64::_dcos_coef); |
| return start; |
| } |
| |
| address generate_dlog() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "dlog"); |
| address entry = __ pc(); |
| FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, |
| vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; |
| Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; |
| __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, |
| tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); |
| return entry; |
| } |
| |
| |
| // code for comparing 16 characters of strings with Latin1 and UTF-16 encoding |
| void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, |
| Label &DIFF2) { |
| Register cnt1 = r2, tmp2 = r11, tmp3 = r12; |
| FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; |
| |
| __ ldrq(vtmp, Address(__ post(tmp2, 16))); |
| __ ldr(tmpU, Address(__ post(cnt1, 8))); |
| __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); |
| // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 |
| |
| __ fmovd(tmpL, vtmp3); |
| __ eor(rscratch2, tmp3, tmpL); |
| __ cbnz(rscratch2, DIFF2); |
| |
| __ ldr(tmp3, Address(__ post(cnt1, 8))); |
| __ umov(tmpL, vtmp3, __ D, 1); |
| __ eor(rscratch2, tmpU, tmpL); |
| __ cbnz(rscratch2, DIFF1); |
| |
| __ zip2(vtmp, __ T16B, vtmp, vtmpZ); |
| __ ldr(tmpU, Address(__ post(cnt1, 8))); |
| __ fmovd(tmpL, vtmp); |
| __ eor(rscratch2, tmp3, tmpL); |
| __ cbnz(rscratch2, DIFF2); |
| |
| __ ldr(tmp3, Address(__ post(cnt1, 8))); |
| __ umov(tmpL, vtmp, __ D, 1); |
| __ eor(rscratch2, tmpU, tmpL); |
| __ cbnz(rscratch2, DIFF1); |
| } |
| |
| // r0 = result |
| // r1 = str1 |
| // r2 = cnt1 |
| // r3 = str2 |
| // r4 = cnt2 |
| // r10 = tmp1 |
| // r11 = tmp2 |
| address generate_compare_long_string_different_encoding(bool isLU) { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", isLU |
| ? "compare_long_string_different_encoding LU" |
| : "compare_long_string_different_encoding UL"); |
| address entry = __ pc(); |
| Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, |
| DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, |
| LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; |
| Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, |
| tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; |
| FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; |
| RegSet spilled_regs = RegSet::of(tmp3, tmp4); |
| |
| int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); |
| |
| __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); |
| // cnt2 == number of characters left to compare |
| // Check the first 4 characters, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL)) |
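| // zip1 with the zero vector interleaves zero bytes between the Latin-1 |
| // bytes, inflating them to UTF-16 characters in place. |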
| __ zip1(vtmp, __ T8B, vtmp, vtmpZ); |
| __ add(str1, str1, isLU ? wordSize/2 : wordSize); |
| __ add(str2, str2, isLU ? wordSize : wordSize/2); |
| __ fmovd(isLU ? tmp1 : tmp2, vtmp); |
| __ subw(cnt2, cnt2, 8); // 4 characters are already loaded; the last 4 are a special case. |
| __ eor(rscratch2, tmp1, tmp2); |
| __ mov(rscratch1, tmp2); |
| __ cbnz(rscratch2, CALCULATE_DIFFERENCE); |
| Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison |
| tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison |
| __ push(spilled_regs, sp); |
| __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load |
| __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load |
| |
| __ ldr(tmp3, Address(__ post(cnt1, 8))); |
| |
| if (SoftwarePrefetchHintDistance >= 0) { |
| __ subs(rscratch2, cnt2, prefetchLoopExitCondition); |
| __ br(__ LT, NO_PREFETCH); |
| __ bind(LARGE_LOOP_PREFETCH); |
| __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); |
| __ mov(tmp4, 2); |
| __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); |
| __ bind(LARGE_LOOP_PREFETCH_REPEAT1); |
| compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); |
| __ subs(tmp4, tmp4, 1); |
| __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); |
| __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); |
| __ mov(tmp4, 2); |
| __ bind(LARGE_LOOP_PREFETCH_REPEAT2); |
| compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); |
| __ subs(tmp4, tmp4, 1); |
| __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); |
| __ sub(cnt2, cnt2, 64); |
| __ subs(rscratch2, cnt2, prefetchLoopExitCondition); |
| __ br(__ GE, LARGE_LOOP_PREFETCH); |
| } |
| __ cbz(cnt2, LOAD_LAST); // no characters left except last load |
| __ bind(NO_PREFETCH); |
| __ subs(cnt2, cnt2, 16); |
| __ br(__ LT, TAIL); |
| __ align(OptoLoopAlignment); |
| __ bind(SMALL_LOOP); // smaller loop |
| __ subs(cnt2, cnt2, 16); |
| compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); |
| __ br(__ GE, SMALL_LOOP); |
| __ cmn(cnt2, (u1)16); |
| __ br(__ EQ, LOAD_LAST); |
| __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) |
| __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string |
| __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string |
| __ ldr(tmp3, Address(cnt1, -8)); |
| compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load |
| __ b(LOAD_LAST); |
| __ bind(DIFF2); |
| __ mov(tmpU, tmp3); |
| __ bind(DIFF1); |
| __ pop(spilled_regs, sp); |
| __ b(CALCULATE_DIFFERENCE); |
| __ bind(LOAD_LAST); |
| // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. |
| // No need to load them again. |
| __ mov(tmpU, tmp3); |
| __ pop(spilled_regs, sp); |
| |
| // tmp2 points to the address of the last 4 Latin1 characters right now |
| __ ldrs(vtmp, Address(tmp2)); |
| __ zip1(vtmp, __ T8B, vtmp, vtmpZ); |
| __ fmovd(tmpL, vtmp); |
| |
| __ eor(rscratch2, tmpU, tmpL); |
| __ cbz(rscratch2, DONE); |
| |
| // Find the first different characters in the longwords and |
| // compute their difference. |
| __ bind(CALCULATE_DIFFERENCE); |
| __ rev(rscratch2, rscratch2); |
| __ clz(rscratch2, rscratch2); |
| __ andr(rscratch2, rscratch2, -16); |
| __ lsrv(tmp1, tmp1, rscratch2); |
| __ uxthw(tmp1, tmp1); |
| __ lsrv(rscratch1, rscratch1, rscratch2); |
| __ uxthw(rscratch1, rscratch1); |
| __ subw(result, tmp1, rscratch1); |
| __ bind(DONE); |
| __ ret(lr); |
| return entry; |
| } |
| |
| address generate_method_entry_barrier() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); |
| |
| Label deoptimize_label; |
| |
| address start = __ pc(); |
| |
| BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); |
| |
| if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { |
| BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); |
| // We can get here despite the nmethod being good, if we have not |
| // yet applied our cross modification fence (or data fence). |
| Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); |
| __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); |
| __ ldrw(rscratch2, rscratch2); |
| __ strw(rscratch2, thread_epoch_addr); |
| __ isb(); |
| __ membar(__ LoadLoad); |
| } |
| |
| __ set_last_Java_frame(sp, rfp, lr, rscratch1); |
| |
| __ enter(); |
| __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr |
| |
| __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} |
| |
| __ push_call_clobbered_registers(); |
| |
| __ mov(c_rarg0, rscratch2); |
| __ call_VM_leaf |
| (CAST_FROM_FN_PTR |
| (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); |
| |
| __ reset_last_Java_frame(true); |
| |
| __ mov(rscratch1, r0); |
| |
| __ pop_call_clobbered_registers(); |
| |
| __ cbnz(rscratch1, deoptimize_label); |
| |
| __ leave(); |
| __ ret(lr); |
| |
| __ BIND(deoptimize_label); |
| |
| __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); |
| __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); |
| |
| __ mov(sp, rscratch1); |
| __ br(rscratch2); |
| |
| return start; |
| } |
| |
| // r0 = result |
| // r1 = str1 |
| // r2 = cnt1 |
| // r3 = str2 |
| // r4 = cnt2 |
| // r10 = tmp1 |
| // r11 = tmp2 |
| address generate_compare_long_string_same_encoding(bool isLL) { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", isLL |
| ? "compare_long_string_same_encoding LL" |
| : "compare_long_string_same_encoding UU"); |
| address entry = __ pc(); |
| Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, |
| tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; |
| |
| Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; |
| |
| // exit the large loop when fewer than 64 bytes are left to read or when we |
| // are about to prefetch memory past the end of the array |
| int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); |
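| // (cnt2 counts characters, so the byte-based threshold is halved for UU) |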
| |
| // 8 bytes were already pre-loaded before jumping into the stub, so compare them directly |
| __ eor(rscratch2, tmp1, tmp2); |
| __ cbnz(rscratch2, CAL_DIFFERENCE); |
| |
| __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); |
| // update the pointers to account for the bytes already read |
| __ add(str1, str1, wordSize); |
| __ add(str2, str2, wordSize); |
| if (SoftwarePrefetchHintDistance >= 0) { |
| __ align(OptoLoopAlignment); |
| __ bind(LARGE_LOOP_PREFETCH); |
| __ prfm(Address(str1, SoftwarePrefetchHintDistance)); |
| __ prfm(Address(str2, SoftwarePrefetchHintDistance)); |
| |
| for (int i = 0; i < 4; i++) { |
| __ ldp(tmp1, tmp1h, Address(str1, i * 16)); |
| __ ldp(tmp2, tmp2h, Address(str2, i * 16)); |
| __ cmp(tmp1, tmp2); |
| __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); |
| __ br(Assembler::NE, DIFF); |
| } |
| __ sub(cnt2, cnt2, isLL ? 64 : 32); |
| __ add(str1, str1, 64); |
| __ add(str2, str2, 64); |
| __ subs(rscratch2, cnt2, largeLoopExitCondition); |
| __ br(Assembler::GE, LARGE_LOOP_PREFETCH); |
| __ cbz(cnt2, LENGTH_DIFF); // no more chars left? |
| } |
| |
| __ subs(rscratch1, cnt2, isLL ? 16 : 8); |
| __ br(Assembler::LE, LESS16); |
| __ align(OptoLoopAlignment); |
| __ bind(LOOP_COMPARE16); |
| __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); |
| __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); |
| __ cmp(tmp1, tmp2); |
| __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); |
| __ br(Assembler::NE, DIFF); |
| __ sub(cnt2, cnt2, isLL ? 16 : 8); |
| __ subs(rscratch2, cnt2, isLL ? 16 : 8); |
| __ br(Assembler::LT, LESS16); |
| |
| __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); |
| __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); |
| __ cmp(tmp1, tmp2); |
| __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); |
| __ br(Assembler::NE, DIFF); |
| __ sub(cnt2, cnt2, isLL ? 16 : 8); |
| __ subs(rscratch2, cnt2, isLL ? 16 : 8); |
| __ br(Assembler::GE, LOOP_COMPARE16); |
| __ cbz(cnt2, LENGTH_DIFF); |
| |
| __ bind(LESS16); |
| // compare 8 bytes at a time |
| __ subs(cnt2, cnt2, isLL ? 8 : 4); |
| __ br(Assembler::LE, LESS8); |
| __ ldr(tmp1, Address(__ post(str1, 8))); |
| __ ldr(tmp2, Address(__ post(str2, 8))); |
| __ eor(rscratch2, tmp1, tmp2); |
| __ cbnz(rscratch2, CAL_DIFFERENCE); |
| __ sub(cnt2, cnt2, isLL ? 8 : 4); |
| |
| __ bind(LESS8); // directly load last 8 bytes |
| if (!isLL) { |
| __ add(cnt2, cnt2, cnt2); |
| } |
| __ ldr(tmp1, Address(str1, cnt2)); |
| __ ldr(tmp2, Address(str2, cnt2)); |
| __ eor(rscratch2, tmp1, tmp2); |
| __ cbz(rscratch2, LENGTH_DIFF); |
| __ b(CAL_DIFFERENCE); |
| |
| __ bind(DIFF); |
| __ cmp(tmp1, tmp2); |
| __ csel(tmp1, tmp1, tmp1h, Assembler::NE); |
| __ csel(tmp2, tmp2, tmp2h, Assembler::NE); |
| // reuse rscratch2 register for the result of eor instruction |
| __ eor(rscratch2, tmp1, tmp2); |
| |
| __ bind(CAL_DIFFERENCE); |
| __ rev(rscratch2, rscratch2); |
| __ clz(rscratch2, rscratch2); |
| __ andr(rscratch2, rscratch2, isLL ? -8 : -16); |
| __ lsrv(tmp1, tmp1, rscratch2); |
| __ lsrv(tmp2, tmp2, rscratch2); |
| if (isLL) { |
| __ uxtbw(tmp1, tmp1); |
| __ uxtbw(tmp2, tmp2); |
| } else { |
| __ uxthw(tmp1, tmp1); |
| __ uxthw(tmp2, tmp2); |
| } |
| __ subw(result, tmp1, tmp2); |
| |
| __ bind(LENGTH_DIFF); |
| __ ret(lr); |
| return entry; |
| } |
| |
| enum string_compare_mode { |
| LL, |
| LU, |
| UL, |
| UU, |
| }; |
| |
| // The following registers are declared in aarch64.ad |
| // r0 = result |
| // r1 = str1 |
| // r2 = cnt1 |
| // r3 = str2 |
| // r4 = cnt2 |
| // r10 = tmp1 |
| // r11 = tmp2 |
| // z0 = ztmp1 |
| // z1 = ztmp2 |
| // p0 = pgtmp1 |
| // p1 = pgtmp2 |
| address generate_compare_long_string_sve(string_compare_mode mode) { |
| __ align(CodeEntryAlignment); |
| address entry = __ pc(); |
| Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, |
| tmp1 = r10, tmp2 = r11; |
| |
| Label LOOP, DONE, MISMATCH; |
| Register vec_len = tmp1; |
| Register idx = tmp2; |
| // The minimum of the string lengths has been stored in cnt2. |
| Register cnt = cnt2; |
| FloatRegister ztmp1 = z0, ztmp2 = z1; |
| PRegister pgtmp1 = p0, pgtmp2 = p1; |
| |
| #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ |
| switch (mode) { \ |
| case LL: \ |
| __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ |
| __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ |
| break; \ |
| case LU: \ |
| __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ |
| __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ |
| break; \ |
| case UL: \ |
| __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ |
| __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ |
| break; \ |
| case UU: \ |
| __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ |
| __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ |
| break; \ |
| default: \ |
| ShouldNotReachHere(); \ |
| } |
| |
| const char* stubname; |
| switch (mode) { |
| case LL: stubname = "compare_long_string_same_encoding LL"; break; |
| case LU: stubname = "compare_long_string_different_encoding LU"; break; |
| case UL: stubname = "compare_long_string_different_encoding UL"; break; |
| case UU: stubname = "compare_long_string_same_encoding UU"; break; |
| default: ShouldNotReachHere(); |
| } |
| |
| StubCodeMark mark(this, "StubRoutines", stubname); |
| |
| __ mov(idx, 0); |
| __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); |
| |
| if (mode == LL) { |
| __ sve_cntb(vec_len); |
| } else { |
| __ sve_cnth(vec_len); |
| } |
| |
| __ sub(rscratch1, cnt, vec_len); |
| |
| __ bind(LOOP); |
| |
| // main loop |
| LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); |
| __ add(idx, idx, vec_len); |
| // Compare strings. |
| __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); |
| __ br(__ NE, MISMATCH); |
| __ cmp(idx, rscratch1); |
| __ br(__ LT, LOOP); |
| |
| // post loop, last iteration |
| __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); |
| |
| LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); |
| __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); |
| __ br(__ EQ, DONE); |
| |
| __ bind(MISMATCH); |
| |
| // Crop the predicate at the first mismatch to find its location. |
| __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); |
| // Extract the first different characters of each string. |
| __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); |
| __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); |
| |
| // Compute the difference of the first different characters. |
| __ sub(result, rscratch1, rscratch2); |
| |
| __ bind(DONE); |
| __ ret(lr); |
| #undef LOAD_PAIR |
| return entry; |
| } |
| |
| void generate_compare_long_strings() { |
| if (UseSVE == 0) { |
| StubRoutines::aarch64::_compare_long_string_LL |
| = generate_compare_long_string_same_encoding(true); |
| StubRoutines::aarch64::_compare_long_string_UU |
| = generate_compare_long_string_same_encoding(false); |
| StubRoutines::aarch64::_compare_long_string_LU |
| = generate_compare_long_string_different_encoding(true); |
| StubRoutines::aarch64::_compare_long_string_UL |
| = generate_compare_long_string_different_encoding(false); |
| } else { |
| StubRoutines::aarch64::_compare_long_string_LL |
| = generate_compare_long_string_sve(LL); |
| StubRoutines::aarch64::_compare_long_string_UU |
| = generate_compare_long_string_sve(UU); |
| StubRoutines::aarch64::_compare_long_string_LU |
| = generate_compare_long_string_sve(LU); |
| StubRoutines::aarch64::_compare_long_string_UL |
| = generate_compare_long_string_sve(UL); |
| } |
| } |
| |
| // R0 = result |
| // R1 = str2 |
| // R2 = cnt1 |
| // R3 = str1 |
| // R4 = cnt2 |
| // Clobbers: rscratch1, rscratch2, v0, v1, rflags |
| // |
| // This generic linear code uses a few additional ideas that make it faster: |
| // 1) we can safely keep at least the 1st register of the pattern (since |
| // length >= 8) in order to skip the initial load (helps on systems with a |
| // single load pipeline) |
| // 2) we can use the "fast" algorithm for finding the first character: one |
| // branch per loaded register instead of one branch per character, which is |
| // where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and |
| // 0x7fff7fff...7fff come from |
| // 3) after loading and analyzing the 1st register of the source string, it |
| // can be used to search for every occurrence of the 1st character, saving |
| // a few loads compared with a simpler-but-slower implementation |
| // 4) to avoid lots of push/pop operations, the code below heavily re-uses, |
| // re-initializes and compresses register values, which makes the code |
| // larger and a bit less readable; however, most of the extra operations |
| // are issued during loads or branches, so the penalty is minimal |
| address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { |
| const char* stubName = str1_isL |
| ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") |
| : "indexof_linear_uu"; |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", stubName); |
| address entry = __ pc(); |
| |
| int str1_chr_size = str1_isL ? 1 : 2; |
| int str2_chr_size = str2_isL ? 1 : 2; |
| int str1_chr_shift = str1_isL ? 0 : 1; |
| int str2_chr_shift = str2_isL ? 0 : 1; |
| bool isL = str1_isL && str2_isL; |
| // parameters |
| Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; |
| // temporary registers |
| Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; |
| RegSet spilled_regs = RegSet::range(tmp1, tmp4); |
| // redefinitions |
| Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; |
| |
| __ push(spilled_regs, sp); |
| Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, |
| L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, |
| L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, |
| L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, |
| L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, |
| L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; |
| // Read a whole register from str1. This is safe because length >= 8 here |
| __ ldr(ch1, Address(str1)); |
| // Read a whole register from str2. This is safe because length >= 8 here |
| __ ldr(ch2, Address(str2)); |
| __ sub(cnt2, cnt2, cnt1); |
| __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); |
| if (str1_isL != str2_isL) { |
| __ eor(v0, __ T16B, v0, v0); |
| } |
| __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); |
| __ mul(first, first, tmp1); |
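| // Multiplying by 0x0101...01 (Latin-1 str2) or 0x0001...0001 (UTF-16 str2) |
| // broadcasts the first pattern character into every lane of "first". |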
| // check if we have less than 1 register to check |
| __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); |
| if (str1_isL != str2_isL) { |
| __ fmovd(v1, ch1); |
| } |
| __ br(__ LE, L_SMALL); |
| __ eor(ch2, first, ch2); |
| if (str1_isL != str2_isL) { |
| __ zip1(v1, __ T16B, v1, v0); |
| } |
| __ sub(tmp2, ch2, tmp1); |
| __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); |
| __ bics(tmp2, tmp2, ch2); |
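| // Classic SWAR zero-lane test: with v = first ^ str2 data, |
| // tmp2 = (v - 0x01..01) & ~(v | 0x7f..7f) sets the top bit in every lane |
| // of v that is zero, i.e. at each position where str2 matches the first |
| // pattern character. Lanes above the lowest match can also yield false |
| // positives, but every candidate is re-verified by the compare loops below. |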
| if (str1_isL != str2_isL) { |
| __ fmovd(ch1, v1); |
| } |
| __ br(__ NE, L_HAS_ZERO); |
| __ subs(cnt2, cnt2, wordSize/str2_chr_size); |
| __ add(result, result, wordSize/str2_chr_size); |
| __ add(str2, str2, wordSize); |
| __ br(__ LT, L_POST_LOOP); |
| __ BIND(L_LOOP); |
| __ ldr(ch2, Address(str2)); |
| __ eor(ch2, first, ch2); |
| __ sub(tmp2, ch2, tmp1); |
| __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); |
| __ bics(tmp2, tmp2, ch2); |
| __ br(__ NE, L_HAS_ZERO); |
| __ BIND(L_LOOP_PROCEED); |
| __ subs(cnt2, cnt2, wordSize/str2_chr_size); |
| __ add(str2, str2, wordSize); |
| __ add(result, result, wordSize/str2_chr_size); |
| __ br(__ GE, L_LOOP); |
| __ BIND(L_POST_LOOP); |
| __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check |
| __ br(__ LE, NOMATCH); |
| __ ldr(ch2, Address(str2)); |
| __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); |
| __ eor(ch2, first, ch2); |
| __ sub(tmp2, ch2, tmp1); |
| __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); |
| __ mov(tmp4, -1); // all bits set |
| __ b(L_SMALL_PROCEED); |
| __ align(OptoLoopAlignment); |
| __ BIND(L_SMALL); |
| __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); |
| __ eor(ch2, first, ch2); |
| if (str1_isL != str2_isL) { |
| __ zip1(v1, __ T16B, v1, v0); |
| } |
| __ sub(tmp2, ch2, tmp1); |
| __ mov(tmp4, -1); // all bits set |
| __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); |
| if (str1_isL != str2_isL) { |
| __ fmovd(ch1, v1); // move converted 4 symbols |
| } |
| __ BIND(L_SMALL_PROCEED); |
| __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bits. |
| __ bic(tmp2, tmp2, ch2); |
| __ ands(tmp2, tmp2, tmp4); // clear useless bits and check |
| __ rbit(tmp2, tmp2); |
| __ br(__ EQ, NOMATCH); |
| __ BIND(L_SMALL_HAS_ZERO_LOOP); |
| __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs |
| __ cmp(cnt1, u1(wordSize/str2_chr_size)); |
| __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); |
| if (str2_isL) { // LL |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" |
| __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte); |
| __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info |
| } else { |
| __ mov(ch2, 0xE); // 0b1110: rounds the byte index below down to an even, char-aligned value |
| __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount |
| __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| } |
| __ cmp(ch1, ch2); |
| __ mov(tmp4, wordSize/str2_chr_size); |
| __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); |
| __ BIND(L_SMALL_CMP_LOOP); |
| str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) |
| : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); |
| str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) |
| : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); |
| __ add(tmp4, tmp4, 1); |
| __ cmp(tmp4, cnt1); |
| __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); |
| __ cmp(first, ch2); |
| __ br(__ EQ, L_SMALL_CMP_LOOP); |
| __ BIND(L_SMALL_CMP_LOOP_NOMATCH); |
| __ cbz(tmp2, NOMATCH); // no more matches. exit |
| __ clz(tmp4, tmp2); |
| __ add(result, result, 1); // advance index |
| __ add(str2, str2, str2_chr_size); // advance pointer |
| __ b(L_SMALL_HAS_ZERO_LOOP); |
| __ align(OptoLoopAlignment); |
| __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); |
| __ cmp(first, ch2); |
| __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); |
| __ b(DONE); |
| __ align(OptoLoopAlignment); |
| __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); |
| if (str2_isL) { // LL |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" |
| __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte); |
| __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info |
| } else { |
| __ mov(ch2, 0xE); // 0b1110: rounds the byte index below down to an even, char-aligned value |
| __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount |
| __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| } |
| __ cmp(ch1, ch2); |
| __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); |
| __ b(DONE); |
| __ align(OptoLoopAlignment); |
| __ BIND(L_HAS_ZERO); |
| __ rbit(tmp2, tmp2); |
| __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs |
| // Now compress both counters (cnt2 and cnt1) into one register. This is |
| // fine because both counters are 32-bit and neither is changed in this |
| // loop; cnt1 is simply restored on exit, so it can be re-used here. |
| __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); |
| __ sub(result, result, 1); |
| __ BIND(L_HAS_ZERO_LOOP); |
| __ mov(cnt1, wordSize/str2_chr_size); |
| __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); |
| __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare |
| if (str2_isL) { |
| __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index |
| __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ add(tmp4, tmp4, 1); |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ lsl(tmp2, tmp2, 1); |
| __ mov(tmp4, wordSize/str2_chr_size); |
| } else { |
| __ mov(ch2, 0xE); |
| __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount |
| __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); |
| __ add(tmp4, tmp4, 1); |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); |
| __ lsl(tmp2, tmp2, 1); |
| __ mov(tmp4, wordSize/str2_chr_size); |
| __ sub(str2, str2, str2_chr_size); |
| } |
| __ cmp(ch1, ch2); |
| __ mov(tmp4, wordSize/str2_chr_size); |
| __ br(__ NE, L_CMP_LOOP_NOMATCH); |
| __ BIND(L_CMP_LOOP); |
| str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) |
| : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); |
| str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) |
| : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); |
| __ add(tmp4, tmp4, 1); |
| __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); |
| __ br(__ GE, L_CMP_LOOP_LAST_CMP); |
| __ cmp(cnt1, ch2); |
| __ br(__ EQ, L_CMP_LOOP); |
| __ BIND(L_CMP_LOOP_NOMATCH); |
| // reached when the current candidate position did not match |
| __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop |
| __ clz(tmp4, tmp2); |
| __ add(str2, str2, str2_chr_size); // advance pointer |
| __ b(L_HAS_ZERO_LOOP); |
| __ align(OptoLoopAlignment); |
| __ BIND(L_CMP_LOOP_LAST_CMP); |
| __ cmp(cnt1, ch2); |
| __ br(__ NE, L_CMP_LOOP_NOMATCH); |
| __ b(DONE); |
| __ align(OptoLoopAlignment); |
| __ BIND(L_CMP_LOOP_LAST_CMP2); |
| if (str2_isL) { |
| __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index |
| __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ add(tmp4, tmp4, 1); |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ lsl(tmp2, tmp2, 1); |
| } else { |
| __ mov(ch2, 0xE); |
| __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount |
| __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. |
| __ lslv(tmp2, tmp2, tmp4); |
| __ add(tmp4, tmp4, 1); |
| __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); |
| __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); |
| __ lsl(tmp2, tmp2, 1); |
| __ sub(str2, str2, str2_chr_size); |
| } |
| __ cmp(ch1, ch2); |
| __ br(__ NE, L_CMP_LOOP_NOMATCH); |
| __ b(DONE); |
| __ align(OptoLoopAlignment); |
| __ BIND(L_HAS_ZERO_LOOP_NOMATCH); |
| // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N |
| // until the L_HAS_ZERO block. A byte octet was analyzed in |
| // L_HAS_ZERO_LOOP, so result was increased by at most |
| // wordSize/str2_chr_size - 1 and the respective higher bits were not |
| // changed. L_LOOP_PROCEED will increase result by the number of analyzed |
| // characters, so we can just reset the lower bits of result here: clear |
| // the 2 lower bits for UU/UL and 3 bits for LL. |
| // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2. |
| // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3 |
| // (UU/UL) is the index of the last analyzed substring inside the current |
| // octet, so str2 is at the corresponding start address; we need to |
| // advance it to the next octet. |
| __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed |
| __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); |
| __ bfm(result, zr, 0, 2 - str2_chr_shift); |
| __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 |
| __ movw(cnt2, cnt2); |
| __ b(L_LOOP_PROCEED); |
| __ align(OptoLoopAlignment); |
| __ BIND(NOMATCH); |
| __ mov(result, -1); |
| __ BIND(DONE); |
| __ pop(spilled_regs, sp); |
| __ ret(lr); |
| return entry; |
| } |
| |
| void generate_string_indexof_stubs() { |
| StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); |
| StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); |
| StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); |
| } |
| |
| void inflate_and_store_2_fp_registers(bool generatePrfm, |
| FloatRegister src1, FloatRegister src2) { |
| Register dst = r1; |
| __ zip1(v1, __ T16B, src1, v0); |
| __ zip2(v2, __ T16B, src1, v0); |
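| // Interleaving the source bytes with the zero vector (v0) widens each |
| // Latin-1 byte to a little-endian 16-bit char: zip1 yields the low half |
| // of the register, zip2 the high half. |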
| if (generatePrfm) { |
| __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); |
| } |
| __ zip1(v3, __ T16B, src2, v0); |
| __ zip2(v4, __ T16B, src2, v0); |
| __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); |
| } |
| |
| // R0 = src |
| // R1 = dst |
| // R2 = len |
| // R3 = len >> 3 |
| // V0 = 0 |
| // v1 = loaded 8 bytes |
| // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 |
| address generate_large_byte_array_inflate() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); |
| address entry = __ pc(); |
| Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; |
| Register src = r0, dst = r1, len = r2, octetCounter = r3; |
| const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; |
| |
| // Do one more 8-byte read so that the address is 16-byte aligned in most |
| // cases; this also lets us use a single store instruction below. |
| __ ldrd(v2, __ post(src, 8)); |
| __ sub(octetCounter, octetCounter, 2); |
| __ zip1(v1, __ T16B, v1, v0); |
| __ zip1(v2, __ T16B, v2, v0); |
| __ st1(v1, v2, __ T16B, __ post(dst, 32)); |
| __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); |
| __ subs(rscratch1, octetCounter, large_loop_threshold); |
| __ br(__ LE, LOOP_START); |
| __ b(LOOP_PRFM_START); |
| __ bind(LOOP_PRFM); |
| __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); |
| __ bind(LOOP_PRFM_START); |
| __ prfm(Address(src, SoftwarePrefetchHintDistance)); |
| __ sub(octetCounter, octetCounter, 8); |
| __ subs(rscratch1, octetCounter, large_loop_threshold); |
| inflate_and_store_2_fp_registers(true, v3, v4); |
| inflate_and_store_2_fp_registers(true, v5, v6); |
| __ br(__ GT, LOOP_PRFM); |
| __ cmp(octetCounter, (u1)8); |
| __ br(__ LT, DONE); |
| __ bind(LOOP); |
| __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); |
| __ bind(LOOP_START); |
| __ sub(octetCounter, octetCounter, 8); |
| __ cmp(octetCounter, (u1)8); |
| inflate_and_store_2_fp_registers(false, v3, v4); |
| inflate_and_store_2_fp_registers(false, v5, v6); |
| __ br(__ GE, LOOP); |
| __ bind(DONE); |
| __ ret(lr); |
| return entry; |
| } |
| |
| /** |
| * Arguments: |
| * |
| * Input: |
| * c_rarg0 - current state address |
| * c_rarg1 - H key address |
| * c_rarg2 - data address |
| * c_rarg3 - number of blocks |
| * |
| * Output: |
| * Updated state at c_rarg0 |
| */ |
| address generate_ghash_processBlocks() { |
| // Bafflingly, GCM uses little-endian for the byte order, but |
| // big-endian for the bit order. For example, the polynomial 1 is |
| // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. |
| // |
| // So, we must either reverse the bytes in each word and do |
| // everything big-endian or reverse the bits in each byte and do |
| // it little-endian. On AArch64 it's more idiomatic to reverse |
| // the bits in each byte (we have an instruction, RBIT, to do |
| // that) and keep the data in little-endian bit order through the |
| // calculation, bit-reversing the inputs and outputs. |
| |
| StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); |
| __ align(wordSize * 2); |
| address p = __ pc(); |
| __ emit_int64(0x87); // The low-order bits of the field |
| // polynomial (i.e. p = z^7+z^2+z+1) |
| // repeated in the low and high parts of a |
| // 128-bit vector |
| __ emit_int64(0x87); |
| |
| __ align(CodeEntryAlignment); |
| address start = __ pc(); |
| |
| Register state = c_rarg0; |
| Register subkeyH = c_rarg1; |
| Register data = c_rarg2; |
| Register blocks = c_rarg3; |
| |
| FloatRegister vzr = v30; |
| __ eor(vzr, __ T16B, vzr, vzr); // zero register |
| |
| __ ldrq(v24, p); // The field polynomial |
| |
| __ ldrq(v0, Address(state)); |
| __ ldrq(v1, Address(subkeyH)); |
| |
| __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH |
| __ rbit(v0, __ T16B, v0); |
| __ rev64(v1, __ T16B, v1); |
| __ rbit(v1, __ T16B, v1); |
| |
| __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4 |
| __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) |
| |
| { |
| Label L_ghash_loop; |
| __ bind(L_ghash_loop); |
| |
| __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit |
| // reversing each byte |
| __ rbit(v2, __ T16B, v2); |
| __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state |
| |
| // Multiply state in v2 by subkey in v1 |
| __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, |
| /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, |
| /*temps*/v6, v3, /*reuse/clobber b*/v2); |
| // Reduce v7:v5 by the field polynomial |
| __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); |
| |
| __ sub(blocks, blocks, 1); |
| __ cbnz(blocks, L_ghash_loop); |
| } |
| |
| // The bit-reversed result is at this point in v0 |
| __ rev64(v0, __ T16B, v0); |
| __ rbit(v0, __ T16B, v0); |
| |
| __ st1(v0, __ T16B, state); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| address generate_ghash_processBlocks_wide() { |
| address small = generate_ghash_processBlocks(); |
| |
| StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); |
| __ align(wordSize * 2); |
| address p = __ pc(); |
| __ emit_int64(0x87); // The low-order bits of the field |
| // polynomial (i.e. p = z^7+z^2+z+1) |
| // repeated in the low and high parts of a |
| // 128-bit vector |
| __ emit_int64(0x87); |
| |
| __ align(CodeEntryAlignment); |
| address start = __ pc(); |
| |
| Register state = c_rarg0; |
| Register subkeyH = c_rarg1; |
| Register data = c_rarg2; |
| Register blocks = c_rarg3; |
| |
| const int unroll = 4; |
| |
| __ cmp(blocks, (unsigned char)(unroll * 2)); |
| __ br(__ LT, small); |
| |
| if (unroll > 1) { |
| // Save state before entering routine |
| __ sub(sp, sp, 4 * 16); |
| __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); |
| __ sub(sp, sp, 4 * 16); |
| __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); |
| } |
| |
| __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); |
| |
| if (unroll > 1) { |
| // And restore state |
| __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); |
| __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); |
| } |
| |
| __ cmp(blocks, (unsigned char)0); |
| __ br(__ GT, small); |
| |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| void generate_base64_encode_simdround(Register src, Register dst, |
| FloatRegister codec, u8 size) { |
| |
| FloatRegister in0 = v4, in1 = v5, in2 = v6; |
| FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; |
| FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; |
| |
| Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; |
| |
| __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); |
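| // ld3 de-interleaves the 3-byte groups: in0/in1/in2 hold bytes 0/1/2 of |
| // every group, so the 6-bit index extraction below works lane-wise. |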
| |
| __ ushr(ind0, arrangement, in0, 2); |
| |
| __ ushr(ind1, arrangement, in1, 2); |
| __ shl(in0, arrangement, in0, 6); |
| __ orr(ind1, arrangement, ind1, in0); |
| __ ushr(ind1, arrangement, ind1, 2); |
| |
| __ ushr(ind2, arrangement, in2, 4); |
| __ shl(in1, arrangement, in1, 4); |
| __ orr(ind2, arrangement, in1, ind2); |
| __ ushr(ind2, arrangement, ind2, 2); |
| |
| __ shl(ind3, arrangement, in2, 2); |
| __ ushr(ind3, arrangement, ind3, 2); |
| |
| __ tbl(out0, arrangement, codec, 4, ind0); |
| __ tbl(out1, arrangement, codec, 4, ind1); |
| __ tbl(out2, arrangement, codec, 4, ind2); |
| __ tbl(out3, arrangement, codec, 4, ind3); |
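| // Each tbl above is a 4-register (64-byte) table lookup starting at |
| // "codec"; the indices are 6-bit values, so they always land inside the |
| // table. |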
| |
| __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); |
| } |
| |
| /** |
| * Arguments: |
| * |
| * Input: |
| * c_rarg0 - src_start |
| * c_rarg1 - src_offset |
| * c_rarg2 - src_length |
| * c_rarg3 - dest_start |
| * c_rarg4 - dest_offset |
| * c_rarg5 - isURL |
| * |
| */ |
| address generate_base64_encodeBlock() { |
| |
| static const char toBase64[64] = { |
| 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', |
| 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', |
| 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', |
| 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', |
| '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' |
| }; |
| |
| static const char toBase64URL[64] = { |
| 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', |
| 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', |
| 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', |
| 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', |
| '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' |
| }; |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "encodeBlock"); |
| address start = __ pc(); |
| |
| Register src = c_rarg0; // source array |
| Register soff = c_rarg1; // source start offset |
| Register send = c_rarg2; // source end offset |
| Register dst = c_rarg3; // dest array |
| Register doff = c_rarg4; // position for writing to dest array |
| Register isURL = c_rarg5; // Base64 or URL character set |
| |
| // c_rarg6 and c_rarg7 are free to use as temps |
| Register codec = c_rarg6; |
| Register length = c_rarg7; |
| |
| Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; |
| |
| __ add(src, src, soff); |
| __ add(dst, dst, doff); |
| __ sub(length, send, soff); |
| |
| // load the codec base address |
| __ lea(codec, ExternalAddress((address) toBase64)); |
| __ cbz(isURL, ProcessData); |
| __ lea(codec, ExternalAddress((address) toBase64URL)); |
| |
| __ BIND(ProcessData); |
| |
| // too short to form a SIMD loop; fall back to the 3-byte scalar loop |
| __ cmp(length, (u1)24); |
| __ br(Assembler::LT, Process3B); |
| |
| __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); |
| |
| __ BIND(Process48B); |
| __ cmp(length, (u1)48); |
| __ br(Assembler::LT, Process24B); |
| generate_base64_encode_simdround(src, dst, v0, 16); |
| __ sub(length, length, 48); |
| __ b(Process48B); |
| |
| __ BIND(Process24B); |
| __ cmp(length, (u1)24); |
| __ br(Assembler::LT, SIMDExit); |
| generate_base64_encode_simdround(src, dst, v0, 8); |
| __ sub(length, length, 24); |
| |
| __ BIND(SIMDExit); |
| __ cbz(length, Exit); |
| |
| __ BIND(Process3B); |
| // 3 src bytes, 24 bits |
| __ ldrb(r10, __ post(src, 1)); |
| __ ldrb(r11, __ post(src, 1)); |
| __ ldrb(r12, __ post(src, 1)); |
| __ orrw(r11, r11, r10, Assembler::LSL, 8); |
| __ orrw(r12, r12, r11, Assembler::LSL, 8); |
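| // r12 now holds the 24-bit group b0 << 16 | b1 << 8 | b2; the ubfmw |
| // extracts below pick out the four 6-bit codec indices |
| // [23:18], [17:12], [11:6] and [5:0]. |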
| // codec index |
| __ ubfmw(r15, r12, 18, 23); |
| __ ubfmw(r14, r12, 12, 17); |
| __ ubfmw(r13, r12, 6, 11); |
| __ andw(r12, r12, 63); |
| // get the code based on the codec |
| __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); |
| __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); |
| __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); |
| __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); |
| __ strb(r15, __ post(dst, 1)); |
| __ strb(r14, __ post(dst, 1)); |
| __ strb(r13, __ post(dst, 1)); |
| __ strb(r12, __ post(dst, 1)); |
| __ sub(length, length, 3); |
| __ cbnz(length, Process3B); |
| |
| __ BIND(Exit); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| void generate_base64_decode_simdround(Register src, Register dst, |
| FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { |
| |
| FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; |
| FloatRegister out0 = v20, out1 = v21, out2 = v22; |
| |
| FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; |
| FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; |
| |
| Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; |
| |
| Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; |
| |
| __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); |
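| // ld4 de-interleaves the 4-byte groups: in0..in3 hold characters 0..3 of |
| // every group. |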
| |
| // We need an unsigned saturating subtract to make sure all input values |
| // in the range [0, 63] map to 0 in the higher-half lookup. |
| __ uqsubv(decH0, __ T16B, in0, v27); |
| __ uqsubv(decH1, __ T16B, in1, v27); |
| __ uqsubv(decH2, __ T16B, in2, v27); |
| __ uqsubv(decH3, __ T16B, in3, v27); |
| |
| // lower half lookup |
| __ tbl(decL0, arrangement, codecL, 4, in0); |
| __ tbl(decL1, arrangement, codecL, 4, in1); |
| __ tbl(decL2, arrangement, codecL, 4, in2); |
| __ tbl(decL3, arrangement, codecL, 4, in3); |
| |
| // higher half lookup |
| __ tbx(decH0, arrangement, codecH, 4, decH0); |
| __ tbx(decH1, arrangement, codecH, 4, decH1); |
| __ tbx(decH2, arrangement, codecH, 4, decH2); |
| __ tbx(decH3, arrangement, codecH, 4, decH3); |
| |
| // combine lower and higher |
| __ orr(decL0, arrangement, decL0, decH0); |
| __ orr(decL1, arrangement, decL1, decH1); |
| __ orr(decL2, arrangement, decL2, decH2); |
| __ orr(decL3, arrangement, decL3, decH3); |
| |
| // check for illegal inputs: any value larger than 63 (the 6-bit maximum) |
| __ cm(Assembler::HI, decH0, arrangement, decL0, v27); |
| __ cm(Assembler::HI, decH1, arrangement, decL1, v27); |
| __ cm(Assembler::HI, decH2, arrangement, decL2, v27); |
| __ cm(Assembler::HI, decH3, arrangement, decL3, v27); |
| __ orr(in0, arrangement, decH0, decH1); |
| __ orr(in1, arrangement, decH2, decH3); |
| __ orr(in2, arrangement, in0, in1); |
| __ umaxv(in3, arrangement, in2); |
| __ umov(rscratch2, in3, __ B, 0); |
| |
| // get the data to output |
| __ shl(out0, arrangement, decL0, 2); |
| __ ushr(out1, arrangement, decL1, 4); |
| __ orr(out0, arrangement, out0, out1); |
| __ shl(out1, arrangement, decL1, 4); |
| __ ushr(out2, arrangement, decL2, 2); |
| __ orr(out1, arrangement, out1, out2); |
| __ shl(out2, arrangement, decL2, 6); |
| __ orr(out2, arrangement, out2, decL3); |
| |
| __ cbz(rscratch2, NoIllegalData); |
| |
| // handle illegal input |
| __ umov(r10, in2, __ D, 0); |
| if (size == 16) { |
| __ cbnz(r10, ErrorInLowerHalf); |
| |
| // illegal input is in higher half, store the lower half now. |
| __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); |
| |
| __ umov(r10, in2, __ D, 1); |
| __ umov(r11, out0, __ D, 1); |
| __ umov(r12, out1, __ D, 1); |
| __ umov(r13, out2, __ D, 1); |
| __ b(StoreLegalData); |
| |
| __ BIND(ErrorInLowerHalf); |
| } |
| __ umov(r11, out0, __ D, 0); |
| __ umov(r12, out1, __ D, 0); |
| __ umov(r13, out2, __ D, 0); |
| |
| __ BIND(StoreLegalData); |
| __ tbnz(r10, 5, Exit); // 0xff indicates illegal input |
| __ strb(r11, __ post(dst, 1)); |
| __ strb(r12, __ post(dst, 1)); |
| __ strb(r13, __ post(dst, 1)); |
| __ lsr(r10, r10, 8); |
| __ lsr(r11, r11, 8); |
| __ lsr(r12, r12, 8); |
| __ lsr(r13, r13, 8); |
| __ b(StoreLegalData); |
| |
| __ BIND(NoIllegalData); |
| __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); |
| } |
| |
| |
| /** |
| * Arguments: |
| * |
| * Input: |
| * c_rarg0 - src_start |
| * c_rarg1 - src_offset |
| * c_rarg2 - src_length |
| * c_rarg3 - dest_start |
| * c_rarg4 - dest_offset |
| * c_rarg5 - isURL |
| * c_rarg6 - isMIME |
| * |
| */ |
| address generate_base64_decodeBlock() { |
| |
| // The SIMD part of this Base64 decode intrinsic is based on the algorithm |
| // outlined on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, |
| // in the section titled "Base64 decoding". |
| |
| // The non-SIMD lookup tables are mostly dumped from the fromBase64 array |
| // used in java.util.Base64, except that the trailing character '=' is also |
| // treated as an illegal value in this intrinsic. That is, |
| // java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. |
| static const uint8_t fromBase64ForNoSIMD[256] = { |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, |
| 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, |
| 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, |
| 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| }; |
| |
| static const uint8_t fromBase64URLForNoSIMD[256] = { |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, |
| 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, |
| 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, |
| 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, |
| 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| }; |
| |
| // A legal Base64 code value is in the range [0, 127]. We need two table |
| // lookups with tbl/tbx and combine the results to get the decoded data. The |
| // first lookup uses tbl, which sets out-of-range indices to 0 in the |
| // destination; the second uses tbx, which leaves out-of-range indices |
| // unchanged in the destination. After a saturating subtract of 63, inputs |
| // in [64, 126] select entries [65, 127] of the table in the second lookup, |
| // while input 127 yields the out-of-range index 64 and is caught by the |
| // range check. The table entry at index 64 holds 0, so values already |
| // decoded by the first lookup are left intact. |
| static const uint8_t fromBase64ForSIMD[128] = { |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, |
| 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, |
| 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, |
| 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, |
| }; |
| |
| static const uint8_t fromBase64URLForSIMD[128] = { |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, |
| 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, |
| 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, |
| 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, |
| 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, |
| 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, |
| }; |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "decodeBlock"); |
| address start = __ pc(); |
| |
| Register src = c_rarg0; // source array |
| Register soff = c_rarg1; // source start offset |
| Register send = c_rarg2; // source end offset |
| Register dst = c_rarg3; // dest array |
| Register doff = c_rarg4; // position for writing to dest array |
| Register isURL = c_rarg5; // Base64 or URL character set |
| Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation |
| |
| Register length = send; // reuse send as length of source data to process |
| |
| Register simd_codec = c_rarg6; |
| Register nosimd_codec = c_rarg7; |
| |
| Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; |
| |
| __ enter(); |
| |
| __ add(src, src, soff); |
| __ add(dst, dst, doff); |
| |
| __ mov(doff, dst); |
| |
| __ sub(length, send, soff); |
| __ bfm(length, zr, 0, 1); // clear the low 2 bits: round length down to a multiple of 4 |
| |
| __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); |
| __ cbz(isURL, ProcessData); |
| __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); |
| |
| __ BIND(ProcessData); |
| __ mov(rscratch1, length); |
| __ cmp(length, (u1)144); // 144 = 80 + 64 |
| __ br(Assembler::LT, Process4B); |
| |
| // In the MIME case, the line length cannot be more than 76 |
| // bytes (see RFC 2045). This is too short a block for SIMD |
| // to be worthwhile, so we use non-SIMD here. |
| __ movw(rscratch1, 79); // process the first 80 bytes with the non-SIMD loop below |
| |
| __ BIND(Process4B); |
| __ ldrw(r14, __ post(src, 4)); |
| __ ubfxw(r10, r14, 0, 8); |
| __ ubfxw(r11, r14, 8, 8); |
| __ ubfxw(r12, r14, 16, 8); |
| __ ubfxw(r13, r14, 24, 8); |
| // look up the decoded 6-bit values |
| __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); |
| __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); |
| __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); |
| __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); |
| // error detection, 255u indicates an illegal input |
| __ orrw(r14, r10, r11); |
| __ orrw(r15, r12, r13); |
| __ orrw(r14, r14, r15); |
| __ tbnz(r14, 7, Exit); |
| // recover the data: r10..r13 hold the 6-bit values d0..d3; produce |
| // b0 = d0<<2 | d1>>4, b1 = (d1&0xf)<<4 | d2>>2, b2 = (d2&3)<<6 | d3 |
| __ lslw(r14, r10, 10); |
| __ bfiw(r14, r11, 4, 6); // r14 = d0<<10 | d1<<4 |
| __ bfmw(r14, r12, 2, 5); // r14 = d0<<10 | d1<<4 | d2>>2 |
| __ rev16w(r14, r14); // swap bytes: low byte is now b0, next byte b1 |
| __ bfiw(r13, r12, 6, 2); // r13 = (d2&3)<<6 | d3 = b2 |
| __ strh(r14, __ post(dst, 2)); |
| __ strb(r13, __ post(dst, 1)); |
| // non-simd loop |
| __ subsw(rscratch1, rscratch1, 4); |
| __ br(Assembler::GT, Process4B); |
| |
| // rscratch1 == 0 here iff the whole input was consumed by the 4B loop |
| // above; after the 80-byte MIME pre-pass (counter started at 79) it is |
| // -1, and we fall through to the SIMD path. |
| __ cbzw(rscratch1, Exit); |
| __ sub(length, length, 80); |
| |
| __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); |
| __ cbz(isURL, SIMDEnter); |
| __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); |
| |
| __ BIND(SIMDEnter); |
| __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); |
| __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); |
| __ mov(rscratch1, 63); |
| __ dup(v27, __ T16B, rscratch1); |
| |
| __ BIND(Process64B); |
| __ cmp(length, (u1)64); |
| __ br(Assembler::LT, Process32B); |
| generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); |
| __ sub(length, length, 64); |
| __ b(Process64B); |
| |
| __ BIND(Process32B); |
| __ cmp(length, (u1)32); |
| __ br(Assembler::LT, SIMDExit); |
| generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); |
| __ sub(length, length, 32); |
| __ b(Process32B); |
| |
| __ BIND(SIMDExit); |
| __ cbz(length, Exit); |
| __ movw(rscratch1, length); |
| __ b(Process4B); |
| |
| __ BIND(Exit); |
| __ sub(c_rarg0, dst, doff); |
| |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| // Support for spin waits. |
| address generate_spin_wait() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "spin_wait"); |
| address start = __ pc(); |
| |
| __ spin_wait(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) |
| |
| // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. |
| // |
| // If LSE is in use, generate LSE versions of all the stubs. The |
| // non-LSE versions are in atomic_aarch64.S. |
| |
| // The AtomicStubMark class records the entry point of a stub and the |
| // stub pointer which will point to it. The stub pointer is set to |
| // the entry point when ~AtomicStubMark() is called, which must be |
| // after ICache::invalidate_range. This ensures safe publication of |
| // the generated code. |
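| // Typical use, as in generate_atomic_entry_points() below: |
| // |
| //   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); |
| //   gen_ldadd_entry(Assembler::word, memory_order_conservative); |
| // |
| // The AtomicStubMark destructor then publishes the recorded entry point. |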
| class AtomicStubMark { |
| address _entry_point; |
| aarch64_atomic_stub_t *_stub; |
| MacroAssembler *_masm; |
| public: |
| AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { |
| _masm = masm; |
| __ align(32); |
| _entry_point = __ pc(); |
| _stub = stub; |
| } |
| ~AtomicStubMark() { |
| *_stub = (aarch64_atomic_stub_t)_entry_point; |
| } |
| }; |
| |
| // NB: For memory_order_conservative we need a trailing membar after |
| // LSE atomic operations but not a leading membar. |
| // |
| // We don't need a leading membar because a clause in the Arm ARM |
| // says: |
| // |
| // Barrier-ordered-before |
| // |
| // Barrier instructions order prior Memory effects before subsequent |
| // Memory effects generated by the same Observer. A read or a write |
| // RW1 is Barrier-ordered-before a read or a write RW2 from the same |
| // Observer if and only if RW1 appears in program order before RW2 |
| // and [ ... ] at least one of RW1 and RW2 is generated by an atomic |
| // instruction with both Acquire and Release semantics. |
| // |
| // All the atomic instructions {ldaddal, swapal, casal} have Acquire |
| // and Release semantics, therefore we don't need a leading |
| // barrier. However, there is no corresponding Barrier-ordered-after |
| // relationship, therefore we need a trailing membar to prevent a |
| // later store or load from being reordered with the store in an |
| // atomic instruction. |
| // |
| // This was checked by using the herd7 consistency model simulator |
| // (http://diy.inria.fr/) with this test case: |
| // |
| // AArch64 LseCas |
| // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } |
| // P0 | P1; |
| // LDR W4, [X2] | MOV W3, #0; |
| // DMB LD | MOV W4, #1; |
| // LDR W3, [X1] | CASAL W3, W4, [X1]; |
| // | DMB ISH; |
| // | STR W4, [X2]; |
| // exists |
| // (0:X3=0 /\ 0:X4=1) |
| // |
| // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered |
| // with the store to x in P1. Without the DMB in P1 this may happen. |
| // |
| // At the time of writing we don't know of any AArch64 hardware that |
| // reorders stores in this way, but the Reference Manual permits it. |
| |
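| // In C, approximately, each CAS stub below computes, atomically and |
| // with the ordering selected by `order`: |
| // |
| //   prev = *ptr; |
| //   if (prev == compare_val) *ptr = exchange_val; |
| //   return prev; |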
| void gen_cas_entry(Assembler::operand_size size, |
| atomic_memory_order order) { |
| Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, |
| exchange_val = c_rarg2; |
| bool acquire, release; |
| switch (order) { |
| case memory_order_relaxed: |
| acquire = false; |
| release = false; |
| break; |
| case memory_order_release: |
| acquire = false; |
| release = true; |
| break; |
| default: |
| acquire = true; |
| release = true; |
| break; |
| } |
| __ mov(prev, compare_val); |
| __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); |
| if (order == memory_order_conservative) { |
| __ membar(Assembler::StoreStore|Assembler::StoreLoad); |
| } |
| if (size == Assembler::xword) { |
| __ mov(r0, prev); |
| } else { |
| __ movw(r0, prev); |
| } |
| __ ret(lr); |
| } |
| |
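| // In C, approximately, each fetch-and-add stub computes, atomically: |
| //   prev = *addr; *addr = prev + incr; return prev; |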
| void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { |
| Register prev = r2, addr = c_rarg0, incr = c_rarg1; |
| // If not relaxed, then default to conservative. Relaxed is the only |
| // case we use enough to be worth specializing. |
| if (order == memory_order_relaxed) { |
| __ ldadd(size, incr, prev, addr); |
| } else { |
| __ ldaddal(size, incr, prev, addr); |
| __ membar(Assembler::StoreStore|Assembler::StoreLoad); |
| } |
| if (size == Assembler::xword) { |
| __ mov(r0, prev); |
| } else { |
| __ movw(r0, prev); |
| } |
| __ ret(lr); |
| } |
| |
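| // In C, approximately, each exchange stub computes, atomically: |
| //   prev = *addr; *addr = incr; return prev; |
| // (the register named incr carries the new value here) |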
| void gen_swpal_entry(Assembler::operand_size size) { |
| Register prev = r2, addr = c_rarg0, incr = c_rarg1; |
| __ swpal(size, incr, prev, addr); |
| __ membar(Assembler::StoreStore|Assembler::StoreLoad); |
| if (size == Assembler::xword) { |
| __ mov(r0, prev); |
| } else { |
| __ movw(r0, prev); |
| } |
| __ ret(lr); |
| } |
| |
| void generate_atomic_entry_points() { |
| if (! UseLSE) { |
| return; |
| } |
| |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "atomic entry points"); |
| address first_entry = __ pc(); |
| |
| // ADD, memory_order_conservative |
| AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); |
| gen_ldadd_entry(Assembler::word, memory_order_conservative); |
| AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); |
| gen_ldadd_entry(Assembler::xword, memory_order_conservative); |
| |
| // ADD, memory_order_relaxed |
| AtomicStubMark mark_fetch_add_4_relaxed |
| (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); |
| gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); |
| AtomicStubMark mark_fetch_add_8_relaxed |
| (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); |
| gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); |
| |
| // XCHG, memory_order_conservative |
| AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); |
| gen_swpal_entry(Assembler::word); |
| AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); |
| gen_swpal_entry(Assembler::xword); |
| |
| // CAS, memory_order_conservative |
| AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); |
| gen_cas_entry(MacroAssembler::byte, memory_order_conservative); |
| AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); |
| gen_cas_entry(MacroAssembler::word, memory_order_conservative); |
| AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); |
| gen_cas_entry(MacroAssembler::xword, memory_order_conservative); |
| |
| // CAS, memory_order_relaxed |
| AtomicStubMark mark_cmpxchg_1_relaxed |
| (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); |
| gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); |
| AtomicStubMark mark_cmpxchg_4_relaxed |
| (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); |
| gen_cas_entry(MacroAssembler::word, memory_order_relaxed); |
| AtomicStubMark mark_cmpxchg_8_relaxed |
| (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); |
| gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); |
| |
| AtomicStubMark mark_cmpxchg_4_release |
| (_masm, &aarch64_atomic_cmpxchg_4_release_impl); |
| gen_cas_entry(MacroAssembler::word, memory_order_release); |
| AtomicStubMark mark_cmpxchg_8_release |
| (_masm, &aarch64_atomic_cmpxchg_8_release_impl); |
| gen_cas_entry(MacroAssembler::xword, memory_order_release); |
| |
| AtomicStubMark mark_cmpxchg_4_seq_cst |
| (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); |
| gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); |
| AtomicStubMark mark_cmpxchg_8_seq_cst |
| (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); |
| gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); |
| |
| ICache::invalidate_range(first_entry, __ pc() - first_entry); |
| } |
| #endif // LINUX && !__ARM_FEATURE_ATOMICS |
| |
| address generate_cont_thaw(Continuation::thaw_kind kind) { |
| bool return_barrier = Continuation::is_thaw_return_barrier(kind); |
| bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); |
| |
| address start = __ pc(); |
| |
| if (return_barrier) { |
| __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); |
| __ mov(sp, rscratch1); |
| } |
| assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); |
| |
| if (return_barrier) { |
| // preserve possible return value from a method returning to the return barrier |
| __ fmovd(rscratch1, v0); |
| __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); |
| } |
| |
| __ movw(c_rarg1, (return_barrier ? 1 : 0)); |
| __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); |
| __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames |
| |
| if (return_barrier) { |
| // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) |
| __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); |
| __ fmovd(v0, rscratch1); |
| } |
| assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); |
| |
| Label thaw_success; |
| // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames |
| __ cbnz(rscratch2, thaw_success); |
| __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); |
| __ br(rscratch1); |
| __ bind(thaw_success); |
| |
| // make room for the thawed frames |
| __ sub(rscratch1, sp, rscratch2); |
| __ andr(rscratch1, rscratch1, -16); // align |
| __ mov(sp, rscratch1); |
| |
| if (return_barrier) { |
| // save original return value -- again |
| __ fmovd(rscratch1, v0); |
| __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); |
| } |
| |
| // If we want, we can templatize thaw by kind, and have three different entries |
| __ movw(c_rarg1, (uint32_t)kind); |
| |
| __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); |
| __ mov(rscratch2, r0); // r0 is the sp of the yielding frame |
| |
| if (return_barrier) { |
| // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) |
| __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); |
| __ fmovd(v0, rscratch1); |
| } else { |
| __ mov(r0, zr); // return 0 (success) from doYield |
| } |
| |
| // we're now on the yield frame (which is at an address above us because sp has been moved down) |
| __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill |
| __ mov(rfp, sp); |
| |
| if (return_barrier_exception) { |
| __ ldr(c_rarg1, Address(rfp, wordSize)); // return address |
| __ verify_oop(r0); |
| __ mov(r19, r0); // save return value containing the exception oop in callee-saved r19 |
| |
| __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); |
| |
| // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. |
| // __ reinitialize_ptrue(); |
| |
| // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc |
| |
| __ mov(r1, r0); // the exception handler |
| __ mov(r0, r19); // restore return value containing the exception oop |
| __ verify_oop(r0); |
| |
| __ leave(); |
| __ mov(r3, lr); |
| __ br(r1); // the exception handler |
| } else { |
| // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame |
| __ leave(); |
| __ ret(lr); |
| } |
| |
| return start; |
| } |
| |
| address generate_cont_thaw() { |
| if (!Continuations::enabled()) return nullptr; |
| |
| StubCodeMark mark(this, "StubRoutines", "Cont thaw"); |
| address start = __ pc(); |
| generate_cont_thaw(Continuation::thaw_top); |
| return start; |
| } |
| |
| address generate_cont_returnBarrier() { |
| if (!Continuations::enabled()) return nullptr; |
| |
| // TODO: will probably need multiple return barriers depending on return type |
| StubCodeMark mark(this, "StubRoutines", "cont return barrier"); |
| address start = __ pc(); |
| |
| generate_cont_thaw(Continuation::thaw_return_barrier); |
| |
| return start; |
| } |
| |
| address generate_cont_returnBarrier_exception() { |
| if (!Continuations::enabled()) return nullptr; |
| |
| StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); |
| address start = __ pc(); |
| |
| generate_cont_thaw(Continuation::thaw_return_barrier_exception); |
| |
| return start; |
| } |
| |
| // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers |
| // are represented as long[5], with BITS_PER_LIMB = 26. |
| // Pack five 26-bit limbs into three 64-bit registers. |
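| // In C, approximately, with src as julong s[5]: |
| //   dest0 = s[0] | (s[1] << 26) | (s[2] << 52);         // low 64 bits |
| //   dest1 = (s[2] >> 12) | (s[3] << 14) | (s[4] << 40); // next 64 bits |
| //   dest2 = s[4] >> 24;                                 // top 2 bits |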
| void pack_26(Register dest0, Register dest1, Register dest2, Register src) { |
| __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits |
| __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits |
| __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); |
| __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits |
| |
| __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits |
| __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits |
| __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); |
| __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits |
| |
| if (dest2->is_valid()) { |
| __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits |
| } else { |
| #ifdef ASSERT |
| Label OK; |
| __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits |
| __ br(__ EQ, OK); |
| __ stop("high bits of Poly1305 integer should be zero"); |
| __ should_not_reach_here(); |
| __ bind(OK); |
| #endif |
| } |
| } |
| |
| // As above, but return only a 128-bit integer, packed into two |
| // 64-bit registers. |
| void pack_26(Register dest0, Register dest1, Register src) { |
| pack_26(dest0, dest1, noreg, src); |
| } |
| |
| // Multiply and multiply-accumulate unsigned 64-bit registers. |
| void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { |
| __ mul(prod_lo, n, m); // low 64 bits of n * m |
| __ umulh(prod_hi, n, m); // high 64 bits of n * m |
| } |
| void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { |
| wide_mul(rscratch1, rscratch2, n, m); |
| __ adds(sum_lo, sum_lo, rscratch1); // 128-bit add: sum_hi:sum_lo += n * m |
| __ adc(sum_hi, sum_hi, rscratch2); |
| } |
| |
| // Poly1305, RFC 7539 |
| |
| // See https://loup-vaillant.fr/tutorials/poly1305-design for a |
| // description of the tricks used to simplify and accelerate this |
| // computation. |
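| |
| // In C, approximately, with the accumulator U and the key R treated as |
| // multi-limb integers: |
| // |
| //   for (each 16-byte little-endian block B of the input) |
| //     U = (U + B + 2^128) * R  mod  2^130 - 5;   // 2^128 is the pad bit |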
| |
| address generate_poly1305_processBlocks() { |
| __ align(CodeEntryAlignment); |
| StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); |
| address start = __ pc(); |
| __ enter(); |
| RegSet callee_saved = RegSet::range(r19, r28); |
| __ push(callee_saved, sp); |
| |
| RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); |
| |
| // Arguments |
| const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; |
| |
| // R_n is the 128-bit randomly-generated key, packed into two |
| // registers. The caller passes this key to us as long[5], with |
| // BITS_PER_LIMB = 26. |
| const Register R_0 = *++regs, R_1 = *++regs; |
| pack_26(R_0, R_1, r_start); |
| |
| // RR_n is (R_n >> 2) * 5 |
| const Register RR_0 = *++regs, RR_1 = *++regs; |
| __ lsr(RR_0, R_0, 2); |
| __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); |
| __ lsr(RR_1, R_1, 2); |
| __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); |
| |
| // U_n is the current checksum |
| const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; |
| pack_26(U_0, U_1, U_2, acc_start); |
| |
| static constexpr int BLOCK_LENGTH = 16; |
| Label DONE, LOOP; |
| |
| __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); |
| __ br(Assembler::LT, DONE); { |
| __ bind(LOOP); |
| |
| // S_n is to be the sum of U_n and the next block of data |
| const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; |
| __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); |
| __ adds(S_0, U_0, S_0); |
| __ adcs(S_1, U_1, S_1); |
| __ adc(S_2, U_2, zr); |
| __ add(S_2, S_2, 1); // add the Poly1305 pad bit at 2^128 |
| |
| const Register U_0HI = *++regs, U_1HI = *++regs; |
| |
| // NB: this logic depends on some of the special properties of |
| // Poly1305 keys. In particular, because we know that the top |
| // four bits of R_0 and R_1 are zero, we can add together |
| // partial products without any risk of needing to propagate a |
| // carry out. |
| wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); |
| wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); |
| __ andr(U_2, R_0, 3); |
| __ mul(U_2, S_2, U_2); |
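| // In C, approximately, the multiplies above compute |
| //   U_0HI:U_0 = S_0*R_0 + S_1*RR_1 + S_2*RR_0; |
| //   U_1HI:U_1 = S_0*R_1 + S_1*R_0 + S_2*RR_1; |
| //   U_2 = S_2 * (R_0 & 3); |
| // where the RR_n terms fold limbs at 2^128 and above back into the low |
| // limbs, using 2^130 == 5 (mod 2^130 - 5). |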
| |
| // Recycle registers S_0, S_1, S_2 |
| regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); |
| |
| // Partial reduction mod 2**130 - 5 |
| __ adds(U_1, U_0HI, U_1); |
| __ adc(U_2, U_1HI, U_2); |
| // Sum now in U_2:U_1:U_0. |
| // Dead: U_0HI, U_1HI. |
| regs = (regs.remaining() + U_0HI + U_1HI).begin(); |
| |
| // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps |
| |
| // First, U_2:U_1:U_0 += (U_2 >> 2) |
| __ lsr(rscratch1, U_2, 2); |
| __ andr(U_2, U_2, (u8)3); |
| __ adds(U_0, U_0, rscratch1); |
| __ adcs(U_1, U_1, zr); |
| __ adc(U_2, U_2, zr); |
| // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 |
| __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); |
| __ adcs(U_1, U_1, zr); |
| __ adc(U_2, U_2, zr); |
| |
| __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); |
| __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); |
| __ br(~ Assembler::LT, LOOP); // i.e. GE: loop while length >= BLOCK_LENGTH |
| } |
| |
| // Further reduce modulo 2^130 - 5 |
| __ lsr(rscratch1, U_2, 2); |
| __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 |
| __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 |
| __ adcs(U_1, U_1, zr); |
| __ andr(U_2, U_2, (u1)3); |
| __ adc(U_2, U_2, zr); |
| |
| // Unpack the sum into five 26-bit limbs and write to memory. |
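| // In C, approximately, with acc_start as julong acc[5]: |
| //   acc[0] = U_0 & 0x3ffffff; |
| //   acc[1] = (U_0 >> 26) & 0x3ffffff; |
| //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12); |
| //   acc[3] = (U_1 >> 14) & 0x3ffffff; |
| //   acc[4] = (U_1 >> 40) | ((U_2 & 0x7) << 24); |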
| __ ubfiz(rscratch1, U_0, 0, 26); |
| __ ubfx(rscratch2, U_0, 26, 26); |
| __ stp(rscratch1, rscratch2, Address(acc_start)); |
| __ ubfx(rscratch1, U_0, 52, 12); |
| __ bfi(rscratch1, U_1, 12, 14); |
| __ ubfx(rscratch2, U_1, 14, 26); |
| __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); |
| __ ubfx(rscratch1, U_1, 40, 24); |
| __ bfi(rscratch1, U_2, 24, 3); |
| __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); |
| |
| __ bind(DONE); |
| __ pop(callee_saved, sp); |
| __ leave(); |
| __ ret(lr); |
| |
| return start; |
| } |
| |
| #if INCLUDE_JFR |
| |
| static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { |
| __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); |
| __ mov(c_rarg0, thread); |
| } |
| |
| // The handle is dereferenced through a load barrier. |
| static void jfr_epilogue(MacroAssembler* _masm) { |
| __ reset_last_Java_frame(true); |
| } |
| |
| // For c2: c_rarg0 is junk; call the runtime to write a checkpoint. |
| // It returns a jobject handle to the event writer. |
| // The handle is dereferenced and the return value is the event writer oop. |
| static RuntimeStub* generate_jfr_write_checkpoint() { |
| enum layout { |
| rbp_off, |
| rbpH_off, |
| return_off, |
| return_off2, |
| framesize // inclusive of return address |
| }; |
| |
| int insts_size = 1024; |
| int locs_size = 64; |
| CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); |
| OopMapSet* oop_maps = new OopMapSet(); |
| MacroAssembler* masm = new MacroAssembler(&code); |
| MacroAssembler* _masm = masm; |
| |
| address start = __ pc(); |
| __ enter(); |
| int frame_complete = __ pc() - start; |
| address the_pc = __ pc(); |
| jfr_prologue(the_pc, _masm, rthread); |
| __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); |
| jfr_epilogue(_masm); |
| __ resolve_global_jobject(r0, rscratch1, rscratch2); |
| __ leave(); |
| __ ret(lr); |
| |
| OopMap* map = new OopMap(framesize, 1); // rfp |
| oop_maps->add_gc_map(the_pc - start, map); |
| |
| RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) |
| RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, |
| (framesize >> (LogBytesPerWord - LogBytesPerInt)), |
| oop_maps, false); |
| return stub; |
| } |
| |
| // For c2: call to return a leased buffer. |
| static RuntimeStub* generate_jfr_return_lease() { |
| enum layout { |
| rbp_off, |
| rbpH_off, |
| return_off, |
| return_off2, |
| framesize // inclusive of return address |
| }; |
| |
| int insts_size = 1024; |
| int locs_size = 64; |
| CodeBuffer code("jfr_return_lease", insts_size, locs_size); |
| OopMapSet* oop_maps = new OopMapSet(); |
| MacroAssembler* masm = new MacroAssembler(&code); |
| MacroAssembler* _masm = masm; |
| |
| address start = __ pc(); |
| __ enter(); |
| int frame_complete = __ pc() - start; |
| address the_pc = __ pc(); |
| jfr_prologue(the_pc, _masm, rthread); |
| __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); |
| jfr_epilogue(_masm); |
| |
| __ leave(); |
| __ ret(lr); |
| |
| OopMap* map = new OopMap(framesize, 1); // rfp |
| oop_maps->add_gc_map(the_pc - start, map); |
| |
| RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) |
| RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete, |
| (framesize >> (LogBytesPerWord - LogBytesPerInt)), |
| oop_maps, false); |
| return stub; |
| } |
| |
| #endif // INCLUDE_JFR |
| |
| // Continuation point for throwing of implicit exceptions that are |
| // not handled in the current activation. Fabricates an exception |
| // oop and initiates normal exception dispatching in this |
| // frame. Since we need to preserve callee-saved values (currently |
| // only for C2, but done for C1 as well) we need a callee-saved oop |
| // map and therefore have to make these stubs into RuntimeStubs |
| // rather than BufferBlobs. If the compiler needs all registers to |
| // be preserved between the fault point and the exception handler |
| // then it must assume responsibility for that in |
| // AbstractCompiler::continuation_for_implicit_null_exception or |
| // continuation_for_implicit_division_by_zero_exception. All other |
| // implicit exceptions (e.g., NullPointerException or |
| // AbstractMethodError on entry) are either at call sites or |
| // otherwise assume that stack unwinding will be initiated, so |
| // caller saved registers were assumed volatile in the compiler. |
| |
| #undef __ |
| #define __ masm-> |
| |
| address generate_throw_exception(const char* name, |
| address runtime_entry, |
| Register arg1 = noreg, |
| Register arg2 = noreg) { |
| // Information about frame layout at time of blocking runtime call. |
| // Note that we only have to preserve callee-saved registers since |
| // the compilers are responsible for supplying a continuation point |
| // if they expect all registers to be preserved. |
| // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 |
| enum layout { |
| rfp_off = 0, |
| rfp_off2, |
| return_off, |
| return_off2, |
| framesize // inclusive of return address |
| }; |
| |
| int insts_size = 512; |
| int locs_size = 64; |
| |
| CodeBuffer code(name, insts_size, locs_size); |
| OopMapSet* oop_maps = new OopMapSet(); |
| MacroAssembler* masm = new MacroAssembler(&code); |
| |
| address start = __ pc(); |
| |
| // This is an inlined and slightly modified version of call_VM |
| // which has the ability to fetch the return PC out of |
| // thread-local storage and also sets up last_Java_sp slightly |
| // differently than the real call_VM |
| |
| __ enter(); // Save FP and LR before call |
| |
| assert(is_even(framesize/2), "sp not 16-byte aligned"); |
| |
| // lr and fp are already in place |
| __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog |
| |
| int frame_complete = __ pc() - start; |
| |
| // Set up last_Java_sp and last_Java_fp |
| address the_pc = __ pc(); |
| __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); |
| |
| // Call runtime |
| if (arg1 != noreg) { |
| assert(arg2 != c_rarg1, "clobbered"); |
| __ mov(c_rarg1, arg1); |
| } |
| if (arg2 != noreg) { |
| __ mov(c_rarg2, arg2); |
| } |
| __ mov(c_rarg0, rthread); |
| BLOCK_COMMENT("call runtime_entry"); |
| __ mov(rscratch1, runtime_entry); |
| __ blr(rscratch1); |
| |
| // Generate oop map |
| OopMap* map = new OopMap(framesize, 0); |
| |
| oop_maps->add_gc_map(the_pc - start, map); |
| |
| __ reset_last_Java_frame(true); |
| |
| // Reinitialize the ptrue predicate register, in case the external runtime |
| // call clobbers ptrue reg, as we may return to SVE compiled code. |
| __ reinitialize_ptrue(); |
| |
| __ leave(); |
| |
| // check for pending exceptions |
| #ifdef ASSERT |
| Label L; |
| __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); |
| __ cbnz(rscratch1, L); |
| __ should_not_reach_here(); |
| __ bind(L); |
| #endif // ASSERT |
| __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); |
| |
| // codeBlob framesize is in words (not VMRegImpl::slot_size) |
| RuntimeStub* stub = |
| RuntimeStub::new_runtime_stub(name, |
| &code, |
| frame_complete, |
| (framesize >> (LogBytesPerWord - LogBytesPerInt)), |
| oop_maps, false); |
| return stub->entry_point(); |
| } |
| |
| class MontgomeryMultiplyGenerator : public MacroAssembler { |
| |
| Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, |
| Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; |
| |
| RegSet _toSave; |
| bool _squaring; |
| |
| public: |
| MontgomeryMultiplyGenerator (Assembler *as, bool squaring) |
| : MacroAssembler(as->code()), _squaring(squaring) { |
| |
| // Register allocation |
| |
| RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); |
| Pa_base = *regs; // Argument registers |
| if (squaring) |
| Pb_base = Pa_base; |
| else |
| Pb_base = *++regs; |
| Pn_base = *++regs; |
| Rlen = *++regs; |
| inv = *++regs; |
| Pm_base = *++regs; |
| |
| // Working registers: |
| Ra = *++regs; // The current digit of a, b, n, and m. |
| Rb = *++regs; |
| Rm = *++regs; |
| Rn = *++regs; |
| |
| Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. |
| Pb = *++regs; |
| Pm = *++regs; |
| Pn = *++regs; |
| |
| t0 = *++regs; // Three registers which form a |
| t1 = *++regs; // triple-precision accumulator. |
| t2 = *++regs; |
| |
| Ri = *++regs; // Inner and outer loop indexes. |
| Rj = *++regs; |
| |
| Rhi_ab = *++regs; // Product registers: low and high parts |
| Rlo_ab = *++regs; // of a*b and m*n. |
| Rhi_mn = *++regs; |
| Rlo_mn = *++regs; |
| |
| // r19 and up are callee-saved. |
| _toSave = RegSet::range(r19, *regs) + Pm_base; |
| } |
| |
| private: |
| void save_regs() { |
| push(_toSave, sp); |
| } |
| |
| void restore_regs() { |
| pop(_toSave, sp); |
| } |
| |
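| // Invoke block count times, unrolled by two copies per loop iteration; |
| // an odd count enters the loop at the second copy. |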
| template <typename T> |
| void unroll_2(Register count, T block) { |
| Label loop, end, odd; |
| tbnz(count, 0, odd); |
| cbz(count, end); |
| align(16); |
| bind(loop); |
| (this->*block)(); |
| bind(odd); |
| (this->*block)(); |
| subs(count, count, 2); |
| br(Assembler::GT, loop); |
| bind(end); |
| } |
| |
| template <typename T> |
| void unroll_2(Register count, T block, Register d, Register s, Register tmp) { |
| Label loop, end, odd; |
| tbnz(count, 0, odd); |
| cbz(count, end); |
| align(16); |
| bind(loop); |
| (this->*block)(d, s, tmp); |
| bind(odd); |
| (this->*block)(d, s, tmp); |
| subs(count, count, 2); |
| br(Assembler::GT, loop); |
| bind(end); |
| } |
| |
| void pre1(RegisterOrConstant i) { |
| block_comment("pre1"); |
| // Pa = Pa_base; |
| // Pb = Pb_base + i; |
| // Pm = Pm_base; |
| // Pn = Pn_base + i; |
| // Ra = *Pa; |
| // Rb = *Pb; |
| // Rm = *Pm; |
| // Rn = *Pn; |
| ldr(Ra, Address(Pa_base)); |
| ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); |
| ldr(Rm, Address(Pm_base)); |
| ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); |
| lea(Pa, Address(Pa_base)); |
| lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); |
| lea(Pm, Address(Pm_base)); |
| lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); |
| |
| // Zero the m*n result. |
| mov(Rhi_mn, zr); |
| mov(Rlo_mn, zr); |
| } |
| |
| // The core multiply-accumulate step of a Montgomery |
| // multiplication. The idea is to schedule operations as a |
| // pipeline so that instructions with long latencies (loads and |
| // multiplies) have time to complete before their results are |
| // used. This benefits in-order implementations of the |
| // architecture most, but out-of-order ones also benefit. |
| void step() { |
| block_comment("step"); |
| // MACC(Ra, Rb, t0, t1, t2); |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| umulh(Rhi_ab, Ra, Rb); |
| mul(Rlo_ab, Ra, Rb); |
| ldr(Ra, pre(Pa, wordSize)); |
| ldr(Rb, pre(Pb, -wordSize)); |
| acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the |
| // previous iteration. |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| umulh(Rhi_mn, Rm, Rn); |
| mul(Rlo_mn, Rm, Rn); |
| ldr(Rm, pre(Pm, wordSize)); |
| ldr(Rn, pre(Pn, -wordSize)); |
| acc(Rhi_ab, Rlo_ab, t0, t1, t2); |
| } |
| |
| void post1() { |
| block_comment("post1"); |
| |
| // MACC(Ra, Rb, t0, t1, t2); |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| umulh(Rhi_ab, Ra, Rb); |
| mul(Rlo_ab, Ra, Rb); |
| acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n |
| acc(Rhi_ab, Rlo_ab, t0, t1, t2); |
| |
| // *Pm = Rm = t0 * inv; |
| mul(Rm, t0, inv); |
| str(Rm, Address(Pm)); |
| |
| // MACC(Rm, Rn, t0, t1, t2); |
| // t0 = t1; t1 = t2; t2 = 0; |
| umulh(Rhi_mn, Rm, Rn); |
| |
| #ifndef PRODUCT |
| // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); |
| { |
| mul(Rlo_mn, Rm, Rn); |
| add(Rlo_mn, t0, Rlo_mn); |
| Label ok; |
| cbz(Rlo_mn, ok); { |
| stop("broken Montgomery multiply"); |
| } bind(ok); |
| } |
| #endif |
| // We have very carefully set things up so that |
| // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate |
| // the lower half of Rm * Rn because we know the result already: |
| // it must be -t0. t0 + (-t0) must generate a carry iff |
| // t0 != 0. So, rather than do a mul and an adds we just set |
| // the carry flag iff t0 is nonzero. |
| // |
| // mul(Rlo_mn, Rm, Rn); |
| // adds(zr, t0, Rlo_mn); |
| subs(zr, t0, 1); // Set carry iff t0 is nonzero |
| adcs(t0, t1, Rhi_mn); |
| adc(t1, t2, zr); |
| mov(t2, zr); |
| } |
| |
| void pre2(RegisterOrConstant i, RegisterOrConstant len) { |
| block_comment("pre2"); |
| // Pa = Pa_base + i-len; |
| // Pb = Pb_base + len; |
| // Pm = Pm_base + i-len; |
| // Pn = Pn_base + len; |
| |
| if (i.is_register()) { |
| sub(Rj, i.as_register(), len); |
| } else { |
| mov(Rj, i.as_constant()); |
| sub(Rj, Rj, len); |
| } |
| // Rj == i-len |
| |
| lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); |
| lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); |
| lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); |
| lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); |
| |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| ldr(Ra, pre(Pa, wordSize)); |
| ldr(Rb, pre(Pb, -wordSize)); |
| ldr(Rm, pre(Pm, wordSize)); |
| ldr(Rn, pre(Pn, -wordSize)); |
| |
| mov(Rhi_mn, zr); |
| mov(Rlo_mn, zr); |
| } |
| |
| void post2(RegisterOrConstant i, RegisterOrConstant len) { |
| block_comment("post2"); |
| if (i.is_constant()) { |
| mov(Rj, i.as_constant()-len.as_constant()); |
| } else { |
| sub(Rj, i.as_register(), len); |
| } |
| |
| adds(t0, t0, Rlo_mn); // The pending m*n, low part |
| |
| // As soon as we know the least significant digit of our result, |
| // store it. |
| // Pm_base[i-len] = t0; |
| str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); |
| |
| // t0 = t1; t1 = t2; t2 = 0; |
| adcs(t0, t1, Rhi_mn); // The pending m*n, high part |
| adc(t1, t2, zr); |
| mov(t2, zr); |
| } |
| |
| // A carry in t0 after Montgomery multiplication means that we |
| // should subtract multiples of n from our result in m. We'll |
| // keep doing that until there is no carry. |
| void normalize(RegisterOrConstant len) { |
| block_comment("normalize"); |
| // while (t0) |
| // t0 = sub(Pm_base, Pn_base, t0, len); |
| Label loop, post, again; |
| Register cnt = t1, i = t2; // Re-use registers; we're done with them now |
| cbz(t0, post); { |
| bind(again); { |
| mov(i, zr); |
| mov(cnt, len); |
| ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); |
| ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); |
| subs(zr, zr, zr); // set carry flag, i.e. no borrow |
| align(16); |
| bind(loop); { |
| sbcs(Rm, Rm, Rn); |
| str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); |
| add(i, i, 1); |
| ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); |
| ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); |
| sub(cnt, cnt, 1); |
| } cbnz(cnt, loop); |
| sbc(t0, t0, zr); |
| } cbnz(t0, again); |
| } bind(post); |
| } |
| |
| // Move memory at s to d, reversing words. |
| // Increments d to end of copied memory |
| // Destroys tmp1, tmp2 |
| // Preserves len |
| // Leaves s pointing to the address which was in d at start |
| void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { |
| assert(tmp1->encoding() < r19->encoding(), "register corruption"); |
| assert(tmp2->encoding() < r19->encoding(), "register corruption"); |
| |
| lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); |
| mov(tmp1, len); |
| unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); |
| sub(s, d, len, ext::uxtw, LogBytesPerWord); |
| } |
| // where reverse1 copies one word, swapping its 32-bit halves: |
| void reverse1(Register d, Register s, Register tmp) { |
| ldr(tmp, pre(s, -wordSize)); |
| ror(tmp, tmp, 32); // swap the two 32-bit halves of the 64-bit word |
| str(tmp, post(d, wordSize)); |
| } |
| |
| void step_squaring() { |
| // An extra ACC |
| step(); |
| acc(Rhi_ab, Rlo_ab, t0, t1, t2); |
| } |
| |
| void last_squaring(RegisterOrConstant i) { |
| Label dont; |
| // if ((i & 1) == 0) { |
| tbnz(i.as_register(), 0, dont); { |
| // MACC(Ra, Rb, t0, t1, t2); |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| umulh(Rhi_ab, Ra, Rb); |
| mul(Rlo_ab, Ra, Rb); |
| acc(Rhi_ab, Rlo_ab, t0, t1, t2); |
| } bind(dont); |
| } |
| |
| void extra_step_squaring() { |
| acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n |
| |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| umulh(Rhi_mn, Rm, Rn); |
| mul(Rlo_mn, Rm, Rn); |
| ldr(Rm, pre(Pm, wordSize)); |
| ldr(Rn, pre(Pn, -wordSize)); |
| } |
| |
| void post1_squaring() { |
| acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n |
| |
| // *Pm = Rm = t0 * inv; |
| mul(Rm, t0, inv); |
| str(Rm, Address(Pm)); |
| |
| // MACC(Rm, Rn, t0, t1, t2); |
| // t0 = t1; t1 = t2; t2 = 0; |
| umulh(Rhi_mn, Rm, Rn); |
| |
| #ifndef PRODUCT |
| // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); |
| { |
| mul(Rlo_mn, Rm, Rn); |
| add(Rlo_mn, t0, Rlo_mn); |
| Label ok; |
| cbz(Rlo_mn, ok); { |
| stop("broken Montgomery multiply"); |
| } bind(ok); |
| } |
| #endif |
| // We have very carefully set things up so that |
| // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate |
| // the lower half of Rm * Rn because we know the result already: |
| // it must be -t0. t0 + (-t0) must generate a carry iff |
| // t0 != 0. So, rather than do a mul and an adds we just set |
| // the carry flag iff t0 is nonzero. |
| // |
| // mul(Rlo_mn, Rm, Rn); |
| // adds(zr, t0, Rlo_mn); |
| subs(zr, t0, 1); // Set carry iff t0 is nonzero |
| adcs(t0, t1, Rhi_mn); |
| adc(t1, t2, zr); |
| mov(t2, zr); |
| } |
| |
| // t2:t1:t0 += Rhi:Rlo (triple-precision accumulate) |
| void acc(Register Rhi, Register Rlo, |
| Register t0, Register t1, Register t2) { |
| adds(t0, t0, Rlo); |
| adcs(t1, t1, Rhi); |
| adc(t2, t2, zr); |
| } |
| |
| public: |
| /** |
| * Fast Montgomery multiplication. The derivation of the |
| * algorithm is in A Cryptographic Library for the Motorola |
| * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. |
| * |
| * Arguments: |
| * |
| * Inputs for multiplication: |
| * c_rarg0 - int array elements a |
| * c_rarg1 - int array elements b |
| * c_rarg2 - int array elements n (the modulus) |
| * c_rarg3 - int length |
| * c_rarg4 - int inv |
| * c_rarg5 - int array elements m (the result) |
| * |
| * Inputs for squaring: |
| * c_rarg0 - int array elements a |
| * c_rarg1 - int array elements n (the modulus) |
| * c_rarg2 - int length |
| * c_rarg3 - int inv |
| * c_rarg4 - int array elements m (the result) |
| * |
| */ |
| address generate_multiply() { |
| Label argh, nothing; |
| bind(argh); |
| stop("MontgomeryMultiply total_allocation must be <= 8192"); |
| |
| align(CodeEntryAlignment); |
| address entry = pc(); |
| |
| cbzw(Rlen, nothing); |
| |
| enter(); |
| |
| // Make room. |
| cmpw(Rlen, 512); |
| br(Assembler::HI, argh); |
| sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); |
| andr(sp, Ra, -2 * wordSize); |
| |
| lsrw(Rlen, Rlen, 1); // length in longwords = len/2 |
| |
| { |
| // Copy input args, reversing as we go. We use Ra as a |
| // temporary variable. |
| reverse(Ra, Pa_base, Rlen, t0, t1); |
| if (!_squaring) |
| reverse(Ra, Pb_base, Rlen, t0, t1); |
| reverse(Ra, Pn_base, Rlen, t0, t1); |
| } |
| |
| // Push all call-saved registers and also Pm_base which we'll need |
| // at the end. |
| save_regs(); |
| |
| #ifndef PRODUCT |
| // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); |
| { |
| ldr(Rn, Address(Pn_base, 0)); |
| mul(Rlo_mn, Rn, inv); |
| subs(zr, Rlo_mn, -1); |
| Label ok; |
| br(EQ, ok); { |
| stop("broken inverse in Montgomery multiply"); |
| } bind(ok); |
| } |
| #endif |
| |
| mov(Pm_base, Ra); |
| |
| mov(t0, zr); |
| mov(t1, zr); |
| mov(t2, zr); |
| |
| block_comment("for (int i = 0; i < len; i++) {"); |
| mov(Ri, zr); { |
| Label loop, end; |
| cmpw(Ri, Rlen); |
| br(Assembler::GE, end); |
| |
| bind(loop); |
| pre1(Ri); |
| |
| block_comment(" for (j = i; j; j--) {"); { |
| movw(Rj, Ri); |
| unroll_2(Rj, &MontgomeryMultiplyGenerator::step); |
| } block_comment(" } // j"); |
| |
| post1(); |
| addw(Ri, Ri, 1); |
| cmpw(Ri, Rlen); |
| br(Assembler::LT, loop); |
| bind(end); |
| block_comment("} // i"); |
| } |
| |
| block_comment("for (int i = len; i < 2*len; i++) {"); |
| mov(Ri, Rlen); { |
| Label loop, end; |
| cmpw(Ri, Rlen, Assembler::LSL, 1); |
| br(Assembler::GE, end); |
| |
| bind(loop); |
| pre2(Ri, Rlen); |
| |
| block_comment(" for (j = len*2-i-1; j; j--) {"); { |
| lslw(Rj, Rlen, 1); |
| subw(Rj, Rj, Ri); |
| subw(Rj, Rj, 1); |
| unroll_2(Rj, &MontgomeryMultiplyGenerator::step); |
| } block_comment(" } // j"); |
| |
| post2(Ri, Rlen); |
| addw(Ri, Ri, 1); |
| cmpw(Ri, Rlen, Assembler::LSL, 1); |
| br(Assembler::LT, loop); |
| bind(end); |
| } |
| block_comment("} // i"); |
| |
| normalize(Rlen); |
| |
| mov(Ra, Pm_base); // Save Pm_base in Ra |
| restore_regs(); // Restore caller's Pm_base |
| |
| // Copy our result into caller's Pm_base |
| reverse(Pm_base, Ra, Rlen, t0, t1); |
| |
| leave(); |
| bind(nothing); |
| ret(lr); |
| |
| return entry; |
| } |
| // In C, approximately: |
| |
| // void |
| // montgomery_multiply(julong Pa_base[], julong Pb_base[], |
| // julong Pn_base[], julong Pm_base[], |
| // julong inv, int len) { |
| // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator |
| // julong *Pa, *Pb, *Pn, *Pm; |
| // julong Ra, Rb, Rn, Rm; |
| |
| // int i; |
| |
| // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); |
| |
| // for (i = 0; i < len; i++) { |
| // int j; |
| |
| // Pa = Pa_base; |
| // Pb = Pb_base + i; |
| // Pm = Pm_base; |
| // Pn = Pn_base + i; |
| |
| // Ra = *Pa; |
| // Rb = *Pb; |
| // Rm = *Pm; |
| // Rn = *Pn; |
| |
| // int iters = i; |
| // for (j = 0; iters--; j++) { |
| // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); |
| // MACC(Ra, Rb, t0, t1, t2); |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| // } |
| |
| // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); |
| // MACC(Ra, Rb, t0, t1, t2); |
| // *Pm = Rm = t0 * inv; |
| // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| |
| // assert(t0 == 0, "broken Montgomery multiply"); |
| |
| // t0 = t1; t1 = t2; t2 = 0; |
| // } |
| |
| // for (i = len; i < 2*len; i++) { |
| // int j; |
| |
| // Pa = Pa_base + i-len; |
| // Pb = Pb_base + len; |
| // Pm = Pm_base + i-len; |
| // Pn = Pn_base + len; |
| |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| |
| // int iters = len*2-i-1; |
| // for (j = i-len+1; iters--; j++) { |
| // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); |
| // MACC(Ra, Rb, t0, t1, t2); |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| // } |
| |
| // Pm_base[i-len] = t0; |
| // t0 = t1; t1 = t2; t2 = 0; |
| // } |
| |
| // while (t0) |
| // t0 = sub(Pm_base, Pn_base, t0, len); |
| // } |
| |
| /** |
| * Fast Montgomery squaring. This uses asymptotically 25% fewer |
| * multiplies than Montgomery multiplication so it should be up to |
| * 25% faster. However, its loop control is more complex and it |
| * may actually run slower on some machines. |
| * |
| * Arguments: |
| * |
| * Inputs: |
| * c_rarg0 - int array elements a |
| * c_rarg1 - int array elements n (the modulus) |
| * c_rarg2 - int length |
| * c_rarg3 - int inv |
| * c_rarg4 - int array elements m (the result) |
| * |
| */ |
| address generate_square() { |
| Label argh; |
| bind(argh); |
| stop("MontgomeryMultiply total_allocation must be <= 8192"); |
| |
| align(CodeEntryAlignment); |
| address entry = pc(); |
| |
| enter(); |
| |
| // Make room. |
| cmpw(Rlen, 512); |
| br(Assembler::HI, argh); |
| sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); |
| andr(sp, Ra, -2 * wordSize); |
| |
| lsrw(Rlen, Rlen, 1); // length in longwords = len/2 |
| |
| { |
| // Copy input args, reversing as we go. We use Ra as a |
| // temporary variable. |
| reverse(Ra, Pa_base, Rlen, t0, t1); |
| reverse(Ra, Pn_base, Rlen, t0, t1); |
| } |
| |
| // Push all call-saved registers and also Pm_base which we'll need |
| // at the end. |
| save_regs(); |
| |
| mov(Pm_base, Ra); |
| |
| mov(t0, zr); |
| mov(t1, zr); |
| mov(t2, zr); |
| |
| block_comment("for (int i = 0; i < len; i++) {"); |
| mov(Ri, zr); { |
| Label loop, end; |
| bind(loop); |
| cmp(Ri, Rlen); |
| br(Assembler::GE, end); |
| |
| pre1(Ri); |
| |
| block_comment("for (j = (i+1)/2; j; j--) {"); { |
| add(Rj, Ri, 1); |
| lsr(Rj, Rj, 1); |
| unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); |
| } block_comment(" } // j"); |
| |
| last_squaring(Ri); |
| |
| block_comment(" for (j = i/2; j; j--) {"); { |
| lsr(Rj, Ri, 1); |
| unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); |
| } block_comment(" } // j"); |
| |
| post1_squaring(); |
| add(Ri, Ri, 1); |
| cmp(Ri, Rlen); |
| br(Assembler::LT, loop); |
| |
| bind(end); |
| block_comment("} // i"); |
| } |
| |
| block_comment("for (int i = len; i < 2*len; i++) {"); |
| mov(Ri, Rlen); { |
| Label loop, end; |
| bind(loop); |
| cmp(Ri, Rlen, Assembler::LSL, 1); |
| br(Assembler::GE, end); |
| |
| pre2(Ri, Rlen); |
| |
| block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { |
| lsl(Rj, Rlen, 1); |
| sub(Rj, Rj, Ri); |
| sub(Rj, Rj, 1); |
| lsr(Rj, Rj, 1); |
| unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); |
| } block_comment(" } // j"); |
| |
| last_squaring(Ri); |
| |
| block_comment(" for (j = (2*len-i)/2; j; j--) {"); { |
| lsl(Rj, Rlen, 1); |
| sub(Rj, Rj, Ri); |
| lsr(Rj, Rj, 1); |
| unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); |
| } block_comment(" } // j"); |
| |
| post2(Ri, Rlen); |
| add(Ri, Ri, 1); |
| cmp(Ri, Rlen, Assembler::LSL, 1); |
| |
| br(Assembler::LT, loop); |
| bind(end); |
| block_comment("} // i"); |
| } |
| |
| normalize(Rlen); |
| |
| mov(Ra, Pm_base); // Save Pm_base in Ra |
| restore_regs(); // Restore caller's Pm_base |
| |
| // Copy our result into caller's Pm_base |
| reverse(Pm_base, Ra, Rlen, t0, t1); |
| |
| leave(); |
| ret(lr); |
| |
| return entry; |
| } |
| // In C, approximately: |
| |
| // void |
| // montgomery_square(julong Pa_base[], julong Pn_base[], |
| // julong Pm_base[], julong inv, int len) { |
| // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator |
| // julong *Pa, *Pb, *Pn, *Pm; |
| // julong Ra, Rb, Rn, Rm; |
| |
| // int i; |
| |
| // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); |
| |
| // for (i = 0; i < len; i++) { |
| // int j; |
| |
| // Pa = Pa_base; |
| // Pb = Pa_base + i; |
| // Pm = Pm_base; |
| // Pn = Pn_base + i; |
| |
| // Ra = *Pa; |
| // Rb = *Pb; |
| // Rm = *Pm; |
| // Rn = *Pn; |
| |
| // int iters = (i+1)/2; |
| // for (j = 0; iters--; j++) { |
| // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); |
| // MACC2(Ra, Rb, t0, t1, t2); |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| // } |
| // if ((i & 1) == 0) { |
| // assert(Ra == Pa_base[j], "must be"); |
| // MACC(Ra, Ra, t0, t1, t2); |
| // } |
| // iters = i/2; |
| // assert(iters == i-j, "must be"); |
| // for (; iters--; j++) { |
| // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| // } |
| |
| // *Pm = Rm = t0 * inv; |
| // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| |
| // assert(t0 == 0, "broken Montgomery multiply"); |
| |
| // t0 = t1; t1 = t2; t2 = 0; |
| // } |
| |
| // for (i = len; i < 2*len; i++) { |
| // int start = i-len+1; |
| // int end = start + (len - start)/2; |
| // int j; |
| |
| // Pa = Pa_base + i-len; |
| // Pb = Pa_base + len; |
| // Pm = Pm_base + i-len; |
| // Pn = Pn_base + len; |
| |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| |
| // int iters = (2*len-i-1)/2; |
| // assert(iters == end-start, "must be"); |
| // for (j = start; iters--; j++) { |
| // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); |
| // MACC2(Ra, Rb, t0, t1, t2); |
| // Ra = *++Pa; |
| // Rb = *--Pb; |
| // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| // } |
| // if ((i & 1) == 0) { |
| // assert(Ra == Pa_base[j], "must be"); |
| // MACC(Ra, Ra, t0, t1, t2); |
| // } |
| // iters = (2*len-i)/2; |
| // assert(iters == len-j, "must be"); |
| // for (; iters--; j++) { |
| // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); |
| // MACC(Rm, Rn, t0, t1, t2); |
| // Rm = *++Pm; |
| // Rn = *--Pn; |
| // } |
| // Pm_base[i-len] = t0; |
| // t0 = t1; t1 = t2; t2 = 0; |
| // } |
| |
| // while (t0) |
| // t0 = sub(Pm_base, Pn_base, t0, len); |
| // } |
| }; |
| |
| |
| // Initialization |
| void generate_initial_stubs() { |
| // Generate initial stubs and initialize the entry points |
| |
| // Entry points that exist on all platforms. Note: this is code |
| // that could be shared among different platforms - however the |
| // benefit seems to be smaller than the disadvantage of having a |
| // much more complicated generator structure. See also the comment in |
| // stubRoutines.hpp. |
| |
| StubRoutines::_forward_exception_entry = generate_forward_exception(); |
| |
| StubRoutines::_call_stub_entry = |
| generate_call_stub(StubRoutines::_call_stub_return_address); |
| |
| // this entry is referenced by megamorphic call sites |
| StubRoutines::_catch_exception_entry = generate_catch_exception(); |
| |
| // Build this early so it's available for the interpreter. |
| StubRoutines::_throw_StackOverflowError_entry = |
| generate_throw_exception("StackOverflowError throw_exception", |
| CAST_FROM_FN_PTR(address, |
| SharedRuntime::throw_StackOverflowError)); |
| StubRoutines::_throw_delayed_StackOverflowError_entry = |
| generate_throw_exception("delayed StackOverflowError throw_exception", |
| CAST_FROM_FN_PTR(address, |
| SharedRuntime::throw_delayed_StackOverflowError)); |
| |
| // Initialize table for copy memory (arraycopy) check. |
| if (UnsafeCopyMemory::_table == nullptr) { |
| UnsafeCopyMemory::create_table(8); |
| } |
| |
| if (UseCRC32Intrinsics) { |
// Set the table address before generating the stubs that use it.
| StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; |
| StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); |
| } |
| |
| if (UseCRC32CIntrinsics) { |
| StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); |
| } |
| |
| // Disabled until JDK-8210858 is fixed |
| // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { |
| // StubRoutines::_dlog = generate_dlog(); |
| // } |
| |
| if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { |
| StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); |
| } |
| |
| if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { |
| StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); |
| } |
| } |
| |
| void generate_continuation_stubs() { |
| // Continuation stubs: |
| StubRoutines::_cont_thaw = generate_cont_thaw(); |
| StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); |
| StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); |
| |
| JFR_ONLY(generate_jfr_stubs();) |
| } |
| |
| #if INCLUDE_JFR |
| void generate_jfr_stubs() { |
| StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint(); |
| StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point(); |
| StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease(); |
| StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point(); |
| } |
| #endif // INCLUDE_JFR |
| |
| void generate_final_stubs() { |
| // support for verify_oop (must happen after universe_init) |
| if (VerifyOops) { |
| StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); |
| } |
| StubRoutines::_throw_AbstractMethodError_entry = |
| generate_throw_exception("AbstractMethodError throw_exception", |
| CAST_FROM_FN_PTR(address, |
| SharedRuntime:: |
| throw_AbstractMethodError)); |
| |
| StubRoutines::_throw_IncompatibleClassChangeError_entry = |
| generate_throw_exception("IncompatibleClassChangeError throw_exception", |
| CAST_FROM_FN_PTR(address, |
| SharedRuntime:: |
| throw_IncompatibleClassChangeError)); |
| |
| StubRoutines::_throw_NullPointerException_at_call_entry = |
| generate_throw_exception("NullPointerException at call throw_exception", |
| CAST_FROM_FN_PTR(address, |
| SharedRuntime:: |
| throw_NullPointerException_at_call)); |
| |
| // arraycopy stubs used by compilers |
| generate_arraycopy_stubs(); |
| |
| BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); |
| if (bs_nm != nullptr) { |
| StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); |
| } |
| |
| StubRoutines::aarch64::_spin_wait = generate_spin_wait(); |
| |
| if (UsePoly1305Intrinsics) { |
| StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); |
| } |
| |
| #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) |
| |
| generate_atomic_entry_points(); |
| |
#endif // LINUX && !__ARM_FEATURE_ATOMICS
| |
StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
| } |
| |
| void generate_compiler_stubs() { |
| #if COMPILER2_OR_JVMCI |
| |
| if (UseSVE == 0) { |
| StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); |
| } |
| |
| // array equals stub for large arrays. |
| if (!UseSimpleArrayEquals) { |
| StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); |
| } |
| |
| // byte_array_inflate stub for large arrays. |
| StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); |
| |
| // countPositives stub for large arrays. |
| StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); |
| |
| generate_compare_long_strings(); |
| |
| generate_string_indexof_stubs(); |
| |
| #ifdef COMPILER2 |
| if (UseMultiplyToLenIntrinsic) { |
| StubRoutines::_multiplyToLen = generate_multiplyToLen(); |
| } |
| |
| if (UseSquareToLenIntrinsic) { |
| StubRoutines::_squareToLen = generate_squareToLen(); |
| } |
| |
| if (UseMulAddIntrinsic) { |
| StubRoutines::_mulAdd = generate_mulAdd(); |
| } |
| |
| if (UseSIMDForBigIntegerShiftIntrinsics) { |
| StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); |
| StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); |
| } |
| |
| if (UseMontgomeryMultiplyIntrinsic) { |
| StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); |
| MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); |
| StubRoutines::_montgomeryMultiply = g.generate_multiply(); |
| } |
| |
| if (UseMontgomerySquareIntrinsic) { |
| StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); |
| MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); |
| // We use generate_multiply() rather than generate_square() |
| // because it's faster for the sizes of modulus we care about. |
| StubRoutines::_montgomerySquare = g.generate_multiply(); |
| } |
| #endif // COMPILER2 |
| |
| if (UseChaCha20Intrinsics) { |
| StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); |
| } |
| |
| if (UseBASE64Intrinsics) { |
| StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); |
| StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); |
| } |
| |
| // data cache line writeback |
| StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); |
| StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); |
| |
| if (UseAESIntrinsics) { |
| StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); |
| StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); |
| StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); |
| StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); |
| StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); |
| } |
| if (UseGHASHIntrinsics) { |
| // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); |
| StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); |
| } |
| if (UseAESIntrinsics && UseGHASHIntrinsics) { |
| StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); |
| } |
| |
| if (UseMD5Intrinsics) { |
| StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); |
| StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); |
| } |
| if (UseSHA1Intrinsics) { |
| StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); |
| StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); |
| } |
| if (UseSHA256Intrinsics) { |
| StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); |
| StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); |
| } |
| if (UseSHA512Intrinsics) { |
| StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); |
| StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); |
| } |
| if (UseSHA3Intrinsics) { |
| StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); |
| StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); |
| } |
| |
| // generate Adler32 intrinsics code |
| if (UseAdler32Intrinsics) { |
| StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); |
| } |
| #endif // COMPILER2_OR_JVMCI |
| } |
| |
| public: |
| StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) { |
| switch(kind) { |
| case Initial_stubs: |
| generate_initial_stubs(); |
| break; |
| case Continuation_stubs: |
| generate_continuation_stubs(); |
| break; |
| case Compiler_stubs: |
| generate_compiler_stubs(); |
| break; |
| case Final_stubs: |
| generate_final_stubs(); |
| break; |
| default: |
| fatal("unexpected stubs kind: %d", kind); |
| break; |
| }; |
| } |
}; // end class StubGenerator
| |
| void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) { |
| StubGenerator g(code, kind); |
| } |
| |
| |
| #if defined (LINUX) |
| |
| // Define pointers to atomic stubs and initialize them to point to the |
| // code in atomic_aarch64.S. |
| |
| #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ |
| extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ |
| (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ |
| aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ |
| = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; |
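
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) expands (modulo
// whitespace) to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// The _default_impl code lives in atomic_aarch64.S; these defaults are only
// used until generate_atomic_entry_points() (called from
// generate_final_stubs() above) repoints the _impl pointers at generated
// stubs.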
| |
| DEFAULT_ATOMIC_OP(fetch_add, 4, ) |
| DEFAULT_ATOMIC_OP(fetch_add, 8, ) |
| DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) |
| DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) |
| DEFAULT_ATOMIC_OP(xchg, 4, ) |
| DEFAULT_ATOMIC_OP(xchg, 8, ) |
| DEFAULT_ATOMIC_OP(cmpxchg, 1, ) |
| DEFAULT_ATOMIC_OP(cmpxchg, 4, ) |
| DEFAULT_ATOMIC_OP(cmpxchg, 8, ) |
| DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) |
| DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) |
| DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) |
| DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) |
| DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) |
| DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) |
| DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) |
| |
| #undef DEFAULT_ATOMIC_OP |
| |
| #endif // LINUX |