| // Copyright 2019, VIXL authors |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // * Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // * Neither the name of ARM Limited nor the names of its contributors may be |
| // used to endorse or promote products derived from this software without |
| // specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND |
| // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE |
| // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "bench-utils.h" |
| |
| #include <vector> |
| |
| #include "globals-vixl.h" |
| |
| #include "aarch64/macro-assembler-aarch64.h" |
| |
| using namespace vixl; |
| using namespace vixl::aarch64; |
| |
| #define __ masm_-> |
| |
| const Register BenchCodeGenerator::scratch = x28; |
| |
| Register BenchCodeGenerator::PickR(unsigned size_in_bits) { |
| // Only select caller-saved registers [x0, x15]. |
| return Register(static_cast<unsigned>(GetRandomBits(4)), size_in_bits); |
| } |
| |
| VRegister BenchCodeGenerator::PickV(unsigned size_in_bits) { |
| // Only select caller-saved registers [v0, v7] or [v16, v31]. |
| // The resulting distribution is not uniform. |
| unsigned code = static_cast<unsigned>(GetRandomBits(5)); |
| if (code < 16) code &= 0x7; // [v8, v15] -> [v0, v7] |
| return VRegister(code, size_in_bits); |
| } |
| |
| uint64_t BenchCodeGenerator::GetRandomBits(int bits) { |
| VIXL_ASSERT((bits >= 0) && (bits <= 64)); |
| uint64_t result = 0; |
| |
| while (bits >= 32) { |
| // For big chunks, call jrand48 directly. |
| result = (result << 32) | jrand48(rand_state_); // [-2^31, 2^31] |
| bits -= 32; |
| } |
| if (bits == 0) return result; |
| |
| // We often only want a few bits at a time, so use stored entropy to avoid |
| // frequent calls to jrand48. |
| |
| if (bits > rnd_bits_) { |
| // We want more bits than we have. |
| result = (result << rnd_bits_) | rnd_; |
| bits -= rnd_bits_; |
| |
| rnd_ = static_cast<uint32_t>(jrand48(rand_state_)); // [-2^31, 2^31] |
| rnd_bits_ = 32; |
| } |
| |
| VIXL_ASSERT(bits <= rnd_bits_); |
| result = (result << bits) | (rnd_ % (UINT32_C(1) << bits)); |
| rnd_ >>= bits; |
| rnd_bits_ -= bits; |
| return result; |
| } |
| |
| unsigned BenchCodeGenerator::PickRSize() { |
| return PickBool() ? kWRegSize : kXRegSize; |
| } |
| |
| unsigned BenchCodeGenerator::PickFPSize() { |
| uint64_t entropy = GetRandomBits(4); |
| // Doubles and floats are common in most languages, so use half-precision |
| // types only rarely. |
| if (entropy == 0) return kHRegSize; |
| return ((entropy & 1) == 0) ? kSRegSize : kDRegSize; |
| } |
| |
| void BenchCodeGenerator::Generate(size_t min_size_in_bytes) { |
| Label start; |
| __ Bind(&start); |
| |
| call_depth_++; |
| GeneratePrologue(); |
| |
| while (masm_->GetSizeOfCodeGeneratedSince(&start) < min_size_in_bytes) { |
| GenerateArbitrarySequence(); |
| } |
| |
| GenerateEpilogue(); |
| call_depth_--; |
| |
| // Make sure that any labels (created by GenerateBranchSequence) are bound |
| // before we exit. |
| if (call_depth_ == 0) BindAllPendingLabels(); |
| } |
| |
| void BenchCodeGenerator::GeneratePrologue() { |
| // Construct a normal frame. |
| VIXL_ASSERT(masm_->StackPointer().Is(sp)); |
| __ Push(lr, x29); // x29 is the frame pointer (fp). |
| __ Mov(x29, sp); |
| VIXL_ASSERT(call_depth_ > 0); |
| if (call_depth_ == 1) { |
| __ Push(scratch, xzr); |
| // Claim space to use for load and stores. |
| // - We need at least 4 * kQRegSize bytes for Ld4/St4. |
| // - The architecture requires that we allocate a multiple of 16 bytes. |
| // - There is no hard upper limit, but the Simulator has a limited stack |
| // space. |
| __ Claim((4 * kQRegSize) + (16 * GetRandomBits(3))); |
| __ Mov(scratch, sp); |
| } |
| } |
| |
| void BenchCodeGenerator::GenerateEpilogue() { |
| VIXL_ASSERT(call_depth_ > 0); |
| if (call_depth_ == 1) { |
| __ Sub(sp, x29, 2 * kXRegSizeInBytes); // Drop the scratch space. |
| __ Pop(xzr, scratch); |
| } |
| __ Pop(x29, lr); |
| __ Ret(); |
| } |
| |
| void BenchCodeGenerator::GenerateArbitrarySequence() { |
| // Bind pending labels, and remove them from the list. |
| // Recently-linked labels are much more likely to be bound than old ones. This |
| // should produce a mix of long- (veneered) and short-range branches. |
| uint32_t bind_mask = static_cast<uint32_t>( |
| GetRandomBits(8) | (GetRandomBits(7) << 1) | (GetRandomBits(6) << 2)); |
| BindPendingLabels(bind_mask); |
| |
| // If we are at the top call level (call_depth_ == 1), generate nested calls |
| // 1/4 of the time, and halve the chance for each call level below that. |
| VIXL_ASSERT(call_depth_ > 0); |
| if (GetRandomBits(call_depth_ + 1) == 0) { |
| GenerateCallReturnSequence(); |
| return; |
| } |
| |
| // These weightings should be roughly representative of real functions. |
| switch (GetRandomBits(4)) { |
| case 0x0: |
| case 0x1: |
| GenerateTrivialSequence(); |
| return; |
| case 0x2: |
| case 0x3: |
| case 0x4: |
| case 0x5: |
| GenerateOperandSequence(); |
| return; |
| case 0x6: |
| case 0x7: |
| case 0x8: |
| GenerateMemOperandSequence(); |
| return; |
| case 0xb: |
| case 0x9: |
| case 0xa: |
| GenerateImmediateSequence(); |
| return; |
| case 0xc: |
| case 0xd: |
| GenerateBranchSequence(); |
| return; |
| case 0xe: |
| GenerateFPSequence(); |
| return; |
| case 0xf: |
| GenerateNEONSequence(); |
| return; |
| } |
| } |
| |
| void BenchCodeGenerator::GenerateTrivialSequence() { |
| unsigned size = PickRSize(); |
| __ Asr(PickR(size), PickR(size), 4); |
| __ Bfi(PickR(size), PickR(size), 5, 14); |
| __ Bfc(PickR(size), 5, 14); |
| __ Cinc(PickR(size), PickR(size), ge); |
| __ Cinv(PickR(size), PickR(size), ne); |
| __ Cls(PickR(size), PickR(size)); |
| __ Cneg(PickR(size), PickR(size), lt); |
| __ Mrs(PickX(), NZCV); |
| __ Nop(); |
| __ Mul(PickR(size), PickR(size), PickR(size)); |
| __ Rbit(PickR(size), PickR(size)); |
| __ Rev(PickR(size), PickR(size)); |
| __ Sdiv(PickR(size), PickR(size), PickR(size)); |
| if (!labels_.empty()) { |
| __ Adr(PickX(), labels_.begin()->target); |
| } |
| } |
| |
| void BenchCodeGenerator::GenerateOperandSequence() { |
| unsigned size = PickRSize(); |
| // The cast to Operand is normally implicit for simple registers, but we |
| // explicitly specify it in every case here to ensure that the benchmark does |
| // what we expect. |
| __ And(PickR(size), PickR(size), Operand(PickR(size))); |
| __ Bics(PickR(size), PickR(size), Operand(PickR(size))); |
| __ Orr(PickR(size), PickR(size), Operand(PickR(size))); |
| __ Eor(PickR(size), PickR(size), Operand(PickR(size))); |
| __ Tst(PickR(size), Operand(PickR(size))); |
| __ Eon(PickR(size), PickR(size), Operand(PickR(size))); |
| __ Cmp(PickR(size), Operand(PickR(size))); |
| __ Negs(PickR(size), Operand(PickR(size))); |
| __ Mvn(PickR(size), Operand(PickR(size))); |
| __ Ccmp(PickR(size), Operand(PickR(size)), NoFlag, eq); |
| __ Ccmn(PickR(size), Operand(PickR(size)), NoFlag, eq); |
| __ Csel(PickR(size), Operand(PickR(size)), Operand(PickR(size)), lt); |
| { |
| // Ensure that `claim` doesn't alias any PickR(). |
| UseScratchRegisterScope temps(masm_); |
| Register claim = temps.AcquireX(); |
| // We should only claim a 16-byte-aligned amount, since we're using the |
| // system stack pointer. |
| __ Mov(claim, GetRandomBits(4) * 16); |
| __ Claim(Operand(claim)); |
| // Also claim a bit more, so we can store at sp+claim. |
| __ Claim(Operand(32)); |
| __ Poke(PickR(size), Operand(claim)); |
| __ Peek(PickR(size), Operand(8)); |
| __ Poke(PickR(size), Operand(16)); |
| __ Peek(PickR(size), Operand(claim.W(), UXTW)); |
| __ Drop(Operand(32)); |
| __ Drop(Operand(claim)); |
| } |
| } |
| |
| void BenchCodeGenerator::GenerateMemOperandSequence() { |
| unsigned size = PickRSize(); |
| RegList store_list = GetRandomBits(16); // Restrict to [x0, x15]. |
| __ StoreCPURegList(CPURegList(CPURegister::kRegister, size, store_list), |
| MemOperand(scratch)); |
| RegList load_list = GetRandomBits(16); // Restrict to [x0, x15]. |
| __ LoadCPURegList(CPURegList(CPURegister::kRegister, size, load_list), |
| MemOperand(scratch)); |
| __ Str(PickX(), MemOperand(scratch)); |
| __ Strb(PickW(), MemOperand(scratch, 42)); |
| __ Strh(PickW(), MemOperand(scratch, 42, PostIndex)); |
| __ Ldrsw(PickX(), MemOperand(scratch, -42, PreIndex)); |
| __ Ldr(PickR(size), MemOperand(scratch, 19)); // Translated to ldur. |
| __ Push(PickX(), PickX()); |
| // Ensure unique registers (in [x0, x15]) for Pop. |
| __ Pop(Register(static_cast<int>(GetRandomBits(2)) + 0, kWRegSize), |
| Register(static_cast<int>(GetRandomBits(2)) + 4, kWRegSize), |
| Register(static_cast<int>(GetRandomBits(2)) + 8, kWRegSize), |
| Register(static_cast<int>(GetRandomBits(2)) + 12, kWRegSize)); |
| } |
| |
| void BenchCodeGenerator::GenerateImmediateSequence() { |
| unsigned size = PickRSize(); |
| __ And(PickR(size), PickR(size), GetRandomBits(size)); |
| __ Sub(PickR(size), PickR(size), GetRandomBits(size)); |
| __ Mov(PickR(size), GetRandomBits(size)); |
| __ Movk(PickX(), GetRandomBits(16), static_cast<int>(GetRandomBits(2)) * 16); |
| } |
| |
| void BenchCodeGenerator::BindPendingLabels(uint64_t bind_mask) { |
| if (bind_mask == 0) return; |
| // The labels we bind here jump back to just after each branch that refers |
| // to them. This allows a simple, linear execution path, whilst still |
| // benchmarking long-range labels. |
| // |
| // Ensure that code falling through into this sequence does not jump |
| // back to an earlier point in the execution path. |
| Label done; |
| __ B(&done); |
| |
| std::list<LabelPair>::iterator it = labels_.begin(); |
| while ((it != labels_.end()) && (bind_mask != 0)) { |
| if ((bind_mask & 1) != 0) { |
| // Bind the label and jump back to its source. |
| __ Bind(it->target); |
| __ B(it->cont); |
| delete it->target; |
| delete it->cont; |
| it = labels_.erase(it); |
| } else { |
| ++it; // Don't bind this one. |
| } |
| bind_mask >>= 1; |
| } |
| __ Bind(&done); |
| } |
| |
| void BenchCodeGenerator::BindAllPendingLabels() { |
| while (!labels_.empty()) { |
| // BindPendingLabels generates a branch over each block of bound labels. |
| // This will be repeated for each call here, but the effect is minimal and |
| // (empirically) we rarely accumulate more than 64 pending labels anyway. |
| BindPendingLabels(UINT64_MAX); |
| } |
| } |
| |
| void BenchCodeGenerator::GenerateBranchSequence() { |
| { |
| LabelPair pair = {new Label(), new Label()}; |
| __ B(lt, pair.target); |
| __ Bind(pair.cont); |
| labels_.push_front(pair); |
| } |
| |
| { |
| LabelPair pair = {new Label(), new Label()}; |
| __ Tbz(PickX(), |
| static_cast<int>(GetRandomBits(kXRegSizeLog2)), |
| pair.target); |
| __ Bind(pair.cont); |
| labels_.push_front(pair); |
| } |
| |
| { |
| LabelPair pair = {new Label(), new Label()}; |
| __ Cbz(PickX(), pair.target); |
| __ Bind(pair.cont); |
| labels_.push_front(pair); |
| } |
| } |
| |
| void BenchCodeGenerator::GenerateCallReturnSequence() { |
| Label fn, done; |
| |
| if (PickBool()) { |
| __ Bl(&fn); |
| } else { |
| Register reg = PickX(); |
| __ Adr(reg, &fn); |
| __ Blr(reg); |
| } |
| __ B(&done); |
| |
| __ Bind(&fn); |
| // Recurse with a randomised (but fairly small) minimum size. |
| Generate(GetRandomBits(8)); |
| |
| __ Bind(&done); |
| } |
| |
| void BenchCodeGenerator::GenerateFPSequence() { |
| unsigned size = PickFPSize(); |
| unsigned other_size = PickBool() ? size * 2 : size / 2; |
| if (other_size < kHRegSize) other_size = kDRegSize; |
| if (other_size > kDRegSize) other_size = kHRegSize; |
| |
| __ Fadd(PickV(size), PickV(size), PickV(size)); |
| __ Fmul(PickV(size), PickV(size), PickV(size)); |
| __ Fcvt(PickV(other_size), PickV(size)); |
| __ Fjcvtzs(PickW(), PickD()); |
| __ Fccmp(PickV(size), PickV(size), NCVFlag, pl); |
| __ Fdiv(PickV(size), PickV(size), PickV(size)); |
| __ Fmov(PickV(size), 1.25 * GetRandomBits(2)); |
| __ Fmsub(PickV(size), PickV(size), PickV(size), PickV(size)); |
| __ Frintn(PickV(size), PickV(size)); |
| } |
| |
| void BenchCodeGenerator::GenerateNEONSequence() { |
| __ And(PickV().V16B(), PickV().V16B(), PickV().V16B()); |
| __ Sqrshl(PickV().V8H(), PickV().V8H(), PickV().V8H()); |
| __ Umull(PickV().V2D(), PickV().V2S(), PickV().V2S()); |
| __ Sqdmlal2(PickV().V4S(), PickV().V8H(), PickV().V8H()); |
| |
| // For structured loads and stores, we have to specify sequential (wrapped) |
| // registers, so start with [v16, v31] and allow them to wrap in to the |
| // [v0, v7] range. |
| VRegister vt(16 + static_cast<unsigned>(GetRandomBits(4)), kQRegSize); |
| VRegister vt2((vt.GetCode() + 1) % kNumberOfVRegisters, kQRegSize); |
| VRegister vt3((vt.GetCode() + 2) % kNumberOfVRegisters, kQRegSize); |
| VRegister vt4((vt.GetCode() + 3) % kNumberOfVRegisters, kQRegSize); |
| VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt)); |
| VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt2)); |
| VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt3)); |
| VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt4)); |
| __ Ld3(vt.V4S(), vt2.V4S(), vt3.V4S(), MemOperand(scratch)); |
| __ St4(vt.V16B(), vt2.V16B(), vt3.V16B(), vt4.V16B(), MemOperand(scratch)); |
| |
| __ Fmaxv(PickV().H(), PickV().V8H()); |
| __ Fminp(PickV().V4S(), PickV().V4S(), PickV().V4S()); |
| } |