// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may
//     be used to endorse or promote products derived from this software
//     without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#include <list>

#include "globals-vixl.h"

#include "aarch64/macro-assembler-aarch64.h"
#include "bench-utils.h"

using namespace vixl;
using namespace vixl::aarch64;

#define __ masm_->

const Register BenchCodeGenerator::scratch = x28;

Register BenchCodeGenerator::PickR(unsigned size_in_bits) {
  // Only select caller-saved registers [x0, x15].
  return Register(static_cast<unsigned>(GetRandomBits(4)), size_in_bits);
}

VRegister BenchCodeGenerator::PickV(unsigned size_in_bits) {
  // Only select caller-saved registers [v0, v7] or [v16, v31].
  // The resulting distribution is not uniform.
  unsigned code = static_cast<unsigned>(GetRandomBits(5));
  if (code < 16) code &= 0x7;  // [v8, v15] -> [v0, v7]
  return VRegister(code, size_in_bits);
}

uint64_t BenchCodeGenerator::GetRandomBits(int bits) {
  VIXL_ASSERT((bits >= 0) && (bits <= 64));
  uint64_t result = 0;

  while (bits >= 32) {
    // For big chunks, call jrand48 directly.
    result = (result << 32) | jrand48(rand_state_);  // [-2^31, 2^31)
    bits -= 32;
  }
  if (bits == 0) return result;

  // We often only want a few bits at a time, so use stored entropy to avoid
  // frequent calls to jrand48.

  if (bits > rnd_bits_) {
    // We want more bits than we have.
    result = (result << rnd_bits_) | rnd_;
    bits -= rnd_bits_;
    rnd_ = static_cast<uint32_t>(jrand48(rand_state_));  // [-2^31, 2^31)
    rnd_bits_ = 32;
  }

  VIXL_ASSERT(bits <= rnd_bits_);
  result = (result << bits) | (rnd_ % (UINT32_C(1) << bits));
  rnd_ >>= bits;
  rnd_bits_ -= bits;
  return result;
}

unsigned BenchCodeGenerator::PickRSize() {
  return PickBool() ? kWRegSize : kXRegSize;
}

unsigned BenchCodeGenerator::PickFPSize() {
  uint64_t entropy = GetRandomBits(4);
  // Doubles and floats are common in most languages, so use half-precision
  // types only rarely.
  if (entropy == 0) return kHRegSize;
  return ((entropy & 1) == 0) ? kSRegSize : kDRegSize;
}
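
// A worked example of the entropy pooling above (illustrative only, assuming
// the pool starts empty, as it does before the first call):
//
//   GetRandomBits(4);  // Pool empty: one jrand48 call refills rnd_ with 32
//                      // bits; 4 bits are consumed and 28 remain.
//   GetRandomBits(5);  // Served from the pool; 23 bits remain.
//   GetRandomBits(3);  // Served from the pool; 20 bits remain.
//
// jrand48 is only called again once a request exceeds the bits remaining in
// the pool.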

void BenchCodeGenerator::Generate(size_t min_size_in_bytes) {
  Label start;
  __ Bind(&start);

  call_depth_++;
  GeneratePrologue();

  while (masm_->GetSizeOfCodeGeneratedSince(&start) < min_size_in_bytes) {
    GenerateArbitrarySequence();
  }

  GenerateEpilogue();
  call_depth_--;

  // Make sure that any labels (created by GenerateBranchSequence) are bound
  // before we exit.
  if (call_depth_ == 0) BindAllPendingLabels();
}

void BenchCodeGenerator::GeneratePrologue() {
  // Construct a normal frame.
  VIXL_ASSERT(masm_->StackPointer().Is(sp));
  __ Push(lr, x29);  // x29 is the frame pointer (fp).
  __ Mov(x29, sp);
  VIXL_ASSERT(call_depth_ > 0);
  if (call_depth_ == 1) {
    __ Push(scratch, xzr);
    // Claim space to use for loads and stores.
    //  - We need at least 4 * kQRegSize bytes for Ld4/St4.
    //  - The architecture requires that we allocate a multiple of 16 bytes.
    //  - There is no hard upper limit, but the Simulator has a limited stack
    //    space.
    __ Claim((4 * kQRegSize) + (16 * GetRandomBits(3)));
    __ Mov(scratch, sp);
  }
}

void BenchCodeGenerator::GenerateEpilogue() {
  VIXL_ASSERT(call_depth_ > 0);
  if (call_depth_ == 1) {
    __ Sub(sp, x29, 2 * kXRegSizeInBytes);  // Drop the scratch space.
    __ Pop(xzr, scratch);
  }
  __ Pop(x29, lr);
  __ Ret();
}

void BenchCodeGenerator::GenerateArbitrarySequence() {
  // Bind pending labels, and remove them from the list.
  // Recently-linked labels are much more likely to be bound than old ones.
  // This should produce a mix of long- (veneered) and short-range branches.
  uint32_t bind_mask = static_cast<uint32_t>(
      GetRandomBits(8) | (GetRandomBits(7) << 1) | (GetRandomBits(6) << 2));
  BindPendingLabels(bind_mask);

  // If we are at the top call level (call_depth_ == 1), generate nested calls
  // 1/4 of the time, and halve the chance for each call level below that.
  VIXL_ASSERT(call_depth_ > 0);
  if (GetRandomBits(call_depth_ + 1) == 0) {
    GenerateCallReturnSequence();
    return;
  }

  // These weightings should be roughly representative of real functions.
  switch (GetRandomBits(4)) {
    case 0x0:
    case 0x1:
      GenerateTrivialSequence();
      return;
    case 0x2:
    case 0x3:
    case 0x4:
    case 0x5:
      GenerateOperandSequence();
      return;
    case 0x6:
    case 0x7:
    case 0x8:
      GenerateMemOperandSequence();
      return;
    case 0x9:
    case 0xa:
    case 0xb:
      GenerateImmediateSequence();
      return;
    case 0xc:
    case 0xd:
      GenerateBranchSequence();
      return;
    case 0xe:
      GenerateFPSequence();
      return;
    case 0xf:
      GenerateNEONSequence();
      return;
  }
}
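
// For reference, the 4-bit draw above weights the sequence types as follows
// (conditional on the nested-call check not having already returned):
//
//   GenerateTrivialSequence:    2/16   GenerateBranchSequence: 2/16
//   GenerateOperandSequence:    4/16   GenerateFPSequence:     1/16
//   GenerateMemOperandSequence: 3/16   GenerateNEONSequence:   1/16
//   GenerateImmediateSequence:  3/16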

void BenchCodeGenerator::GenerateTrivialSequence() {
  unsigned size = PickRSize();
  __ Asr(PickR(size), PickR(size), 4);
  __ Bfi(PickR(size), PickR(size), 5, 14);
  __ Bfc(PickR(size), 5, 14);
  __ Cinc(PickR(size), PickR(size), ge);
  __ Cinv(PickR(size), PickR(size), ne);
  __ Cls(PickR(size), PickR(size));
  __ Cneg(PickR(size), PickR(size), lt);
  __ Mrs(PickX(), NZCV);
  __ Nop();
  __ Mul(PickR(size), PickR(size), PickR(size));
  __ Rbit(PickR(size), PickR(size));
  __ Rev(PickR(size), PickR(size));
  __ Sdiv(PickR(size), PickR(size), PickR(size));
  if (!labels_.empty()) {
    __ Adr(PickX(), labels_.begin()->target);
  }
}

void BenchCodeGenerator::GenerateOperandSequence() {
  unsigned size = PickRSize();
  // The cast to Operand is normally implicit for simple registers, but we
  // explicitly specify it in every case here to ensure that the benchmark
  // does what we expect.
  __ And(PickR(size), PickR(size), Operand(PickR(size)));
  __ Bics(PickR(size), PickR(size), Operand(PickR(size)));
  __ Orr(PickR(size), PickR(size), Operand(PickR(size)));
  __ Eor(PickR(size), PickR(size), Operand(PickR(size)));
  __ Tst(PickR(size), Operand(PickR(size)));
  __ Eon(PickR(size), PickR(size), Operand(PickR(size)));
  __ Cmp(PickR(size), Operand(PickR(size)));
  __ Negs(PickR(size), Operand(PickR(size)));
  __ Mvn(PickR(size), Operand(PickR(size)));
  __ Ccmp(PickR(size), Operand(PickR(size)), NoFlag, eq);
  __ Ccmn(PickR(size), Operand(PickR(size)), NoFlag, eq);
  __ Csel(PickR(size), PickR(size), Operand(PickR(size)), lt);
  {
    // Ensure that `claim` doesn't alias any PickR().
    UseScratchRegisterScope temps(masm_);
    Register claim = temps.AcquireX();
    // We should only claim a 16-byte-aligned amount, since we're using the
    // system stack pointer.
    __ Mov(claim, GetRandomBits(4) * 16);
    __ Claim(Operand(claim));

    // Also claim a bit more, so we can store at sp+claim.
    __ Claim(Operand(32));
    __ Poke(PickR(size), Operand(claim));
    __ Peek(PickR(size), Operand(8));
    __ Poke(PickR(size), Operand(16));
    __ Peek(PickR(size), Operand(claim.W(), UXTW));
    __ Drop(Operand(32));
    __ Drop(Operand(claim));
  }
}

void BenchCodeGenerator::GenerateMemOperandSequence() {
  unsigned size = PickRSize();

  RegList store_list = GetRandomBits(16);  // Restrict to [x0, x15].
  __ StoreCPURegList(CPURegList(CPURegister::kRegister, size, store_list),
                     MemOperand(scratch));
  RegList load_list = GetRandomBits(16);  // Restrict to [x0, x15].
  __ LoadCPURegList(CPURegList(CPURegister::kRegister, size, load_list),
                    MemOperand(scratch));
  __ Str(PickX(), MemOperand(scratch));
  __ Strb(PickW(), MemOperand(scratch, 42));
  __ Strh(PickW(), MemOperand(scratch, 42, PostIndex));
  __ Ldrsw(PickX(), MemOperand(scratch, -42, PreIndex));
  __ Ldr(PickR(size), MemOperand(scratch, 19));  // Translated to ldur.
  __ Push(PickX(), PickX());
  // Ensure unique registers (in [x0, x15]) for Pop.
  __ Pop(Register(static_cast<unsigned>(GetRandomBits(2)) + 0, kWRegSize),
         Register(static_cast<unsigned>(GetRandomBits(2)) + 4, kWRegSize),
         Register(static_cast<unsigned>(GetRandomBits(2)) + 8, kWRegSize),
         Register(static_cast<unsigned>(GetRandomBits(2)) + 12, kWRegSize));
}

void BenchCodeGenerator::GenerateImmediateSequence() {
  unsigned size = PickRSize();

  __ And(PickR(size), PickR(size), GetRandomBits(size));
  __ Sub(PickR(size), PickR(size), GetRandomBits(size));
  __ Mov(PickR(size), GetRandomBits(size));
  __ Movk(PickX(),
          GetRandomBits(16),
          static_cast<int>(GetRandomBits(2)) * 16);
}

void BenchCodeGenerator::BindPendingLabels(uint64_t bind_mask) {
  if (bind_mask == 0) return;

  // The labels we bind here jump back to just after each branch that refers
  // to them. This allows a simple, linear execution path, whilst still
  // benchmarking long-range labels.
  //
  // Ensure that code falling through into this sequence does not jump
  // back to an earlier point in the execution path.
  Label done;
  __ B(&done);

  std::list<LabelPair>::iterator it = labels_.begin();
  while ((it != labels_.end()) && (bind_mask != 0)) {
    if ((bind_mask & 1) != 0) {
      // Bind the label and jump back to its source.
      __ Bind(it->target);
      __ B(it->cont);
      delete it->target;
      delete it->cont;
      it = labels_.erase(it);
    } else {
      ++it;  // Don't bind this one.
    }
    bind_mask >>= 1;
  }
  __ Bind(&done);
}
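
// Sketch of the code layout produced for one bound LabelPair (label names
// are illustrative):
//
//       b.lt target      // Emitted by GenerateBranchSequence.
//     cont:              // Execution continues here either way.
//       ...
//       b done           // Emitted by BindPendingLabels.
//     target:
//       b cont           // Jump back to just after the original branch.
//     done: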

void BenchCodeGenerator::BindAllPendingLabels() {
  while (!labels_.empty()) {
    // BindPendingLabels generates a branch over each block of bound labels.
    // This will be repeated for each call here, but the effect is minimal
    // and (empirically) we rarely accumulate more than 64 pending labels
    // anyway.
    BindPendingLabels(UINT64_MAX);
  }
}

void BenchCodeGenerator::GenerateBranchSequence() {
  {
    LabelPair pair = {new Label(), new Label()};
    __ B(lt, pair.target);
    __ Bind(pair.cont);
    labels_.push_front(pair);
  }

  {
    LabelPair pair = {new Label(), new Label()};
    __ Tbz(PickX(),
           static_cast<unsigned>(GetRandomBits(kXRegSizeLog2)),
           pair.target);
    __ Bind(pair.cont);
    labels_.push_front(pair);
  }

  {
    LabelPair pair = {new Label(), new Label()};
    __ Cbz(PickX(), pair.target);
    __ Bind(pair.cont);
    labels_.push_front(pair);
  }
}

void BenchCodeGenerator::GenerateCallReturnSequence() {
  Label fn, done;

  if (PickBool()) {
    __ Bl(&fn);
  } else {
    Register reg = PickX();
    __ Adr(reg, &fn);
    __ Blr(reg);
  }
  __ B(&done);

  __ Bind(&fn);
  // Recurse with a randomised (but fairly small) minimum size.
  Generate(GetRandomBits(8));

  __ Bind(&done);
}

void BenchCodeGenerator::GenerateFPSequence() {
  unsigned size = PickFPSize();
  // Pick a different size for conversions, wrapping around at the extremes
  // so that H converts to D and D converts to H.
  unsigned other_size = PickBool() ? size * 2 : size / 2;
  if (other_size < kHRegSize) other_size = kDRegSize;
  if (other_size > kDRegSize) other_size = kHRegSize;

  __ Fadd(PickV(size), PickV(size), PickV(size));
  __ Fmul(PickV(size), PickV(size), PickV(size));
  __ Fcvt(PickV(other_size), PickV(size));
  __ Fjcvtzs(PickW(), PickD());
  __ Fccmp(PickV(size), PickV(size), NCVFlag, pl);
  __ Fdiv(PickV(size), PickV(size), PickV(size));
  __ Fmov(PickV(size), 1.25 * GetRandomBits(2));
  __ Fmsub(PickV(size), PickV(size), PickV(size), PickV(size));
  __ Frintn(PickV(size), PickV(size));
}

void BenchCodeGenerator::GenerateNEONSequence() {
  __ And(PickV().V16B(), PickV().V16B(), PickV().V16B());
  __ Sqrshl(PickV().V8H(), PickV().V8H(), PickV().V8H());
  __ Umull(PickV().V2D(), PickV().V2S(), PickV().V2S());
  __ Sqdmlal2(PickV().V4S(), PickV().V8H(), PickV().V8H());

  // For structured loads and stores, we have to specify sequential (wrapped)
  // registers, so start with [v16, v31] and allow them to wrap into the
  // [v0, v7] range (for example, if vt is v30, then vt2..vt4 are v31, v0 and
  // v1).
  VRegister vt(16 + static_cast<unsigned>(GetRandomBits(4)), kQRegSize);
  VRegister vt2((vt.GetCode() + 1) % kNumberOfVRegisters, kQRegSize);
  VRegister vt3((vt.GetCode() + 2) % kNumberOfVRegisters, kQRegSize);
  VRegister vt4((vt.GetCode() + 3) % kNumberOfVRegisters, kQRegSize);
  VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt));
  VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt2));
  VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt3));
  VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt4));
  __ Ld3(vt.V4S(), vt2.V4S(), vt3.V4S(), MemOperand(scratch));
  __ St4(vt.V16B(), vt2.V16B(), vt3.V16B(), vt4.V16B(), MemOperand(scratch));

  __ Fmaxv(PickV().H(), PickV().V8H());
  __ Fminp(PickV().V4S(), PickV().V4S(), PickV().V4S());
}