/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* This file contains codegen for the Arm64 ISA. */

#include "codegen_arm64.h"

#include "arm64_lir.h"
#include "art_method.h"
#include "base/logging.h"
#include "dex/mir_graph.h"
#include "dex/quick/dex_file_to_method_inliner_map.h"
#include "dex/quick/mir_to_lir-inl.h"
#include "driver/compiler_driver.h"
#include "driver/compiler_options.h"
#include "gc/accounting/card_table.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "mirror/object_array-inl.h"
#include "utils/dex_cache_arrays_layout-inl.h"

namespace art {

/*
 * The sparse table in the literal pool is an array of <key, displacement>
 * pairs. For each set, we'll load them as a pair using ldp.
* The test loop will look something like: * * adr r_base, * ldr r_val, [rA64_SP, v_reg_off] * mov r_idx, #table_size * loop: * cbz r_idx, quit * ldp r_key, r_disp, [r_base], #8 * sub r_idx, #1 * cmp r_val, r_key * b.ne loop * adr r_base, #0 ; This is the instruction from which we compute displacements * add r_base, r_disp * br r_base * quit: */ void Arm64Mir2Lir::GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) { const uint16_t* table = mir_graph_->GetTable(mir, table_offset); // Add the table to the list - we'll process it later SwitchTable *tab_rec = static_cast(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData)); tab_rec->switch_mir = mir; tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; uint32_t size = table[1]; switch_tables_.push_back(tab_rec); // Get the switch value rl_src = LoadValue(rl_src, kCoreReg); RegStorage r_base = AllocTempWide(); // Allocate key and disp temps. RegStorage r_key = AllocTemp(); RegStorage r_disp = AllocTemp(); // Materialize a pointer to the switch table NewLIR3(kA64Adr2xd, r_base.GetReg(), 0, WrapPointer(tab_rec)); // Set up r_idx RegStorage r_idx = AllocTemp(); LoadConstant(r_idx, size); // Entry of loop. LIR* loop_entry = NewLIR0(kPseudoTargetLabel); LIR* branch_out = NewLIR2(kA64Cbz2rt, r_idx.GetReg(), 0); // Load next key/disp. NewLIR4(kA64LdpPost4rrXD, r_key.GetReg(), r_disp.GetReg(), r_base.GetReg(), 2); OpRegRegImm(kOpSub, r_idx, r_idx, 1); // Go to next case, if key does not match. OpRegReg(kOpCmp, r_key, rl_src.reg); OpCondBranch(kCondNe, loop_entry); // Key does match: branch to case label. LIR* switch_label = NewLIR3(kA64Adr2xd, r_base.GetReg(), 0, -1); tab_rec->anchor = switch_label; // Add displacement to base branch address and go! OpRegRegRegExtend(kOpAdd, r_base, r_base, As64BitReg(r_disp), kA64Sxtw, 0U); NewLIR1(kA64Br1x, r_base.GetReg()); // Loop exit label. 
LIR* loop_exit = NewLIR0(kPseudoTargetLabel); branch_out->target = loop_exit; } void Arm64Mir2Lir::GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) { const uint16_t* table = mir_graph_->GetTable(mir, table_offset); // Add the table to the list - we'll process it later SwitchTable *tab_rec = static_cast(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData)); tab_rec->switch_mir = mir; tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; uint32_t size = table[1]; switch_tables_.push_back(tab_rec); // Get the switch value rl_src = LoadValue(rl_src, kCoreReg); RegStorage table_base = AllocTempWide(); // Materialize a pointer to the switch table NewLIR3(kA64Adr2xd, table_base.GetReg(), 0, WrapPointer(tab_rec)); int low_key = s4FromSwitchData(&table[2]); RegStorage key_reg; // Remove the bias, if necessary if (low_key == 0) { key_reg = rl_src.reg; } else { key_reg = AllocTemp(); OpRegRegImm(kOpSub, key_reg, rl_src.reg, low_key); } // Bounds check - if < 0 or >= size continue following switch OpRegImm(kOpCmp, key_reg, size - 1); LIR* branch_over = OpCondBranch(kCondHi, nullptr); // Load the displacement from the switch table RegStorage disp_reg = AllocTemp(); LoadBaseIndexed(table_base, As64BitReg(key_reg), disp_reg, 2, k32); // Get base branch address. RegStorage branch_reg = AllocTempWide(); LIR* switch_label = NewLIR3(kA64Adr2xd, branch_reg.GetReg(), 0, -1); tab_rec->anchor = switch_label; // Add displacement to base branch address and go! OpRegRegRegExtend(kOpAdd, branch_reg, branch_reg, As64BitReg(disp_reg), kA64Sxtw, 0U); NewLIR1(kA64Br1x, branch_reg.GetReg()); // branch_over target here LIR* target = NewLIR0(kPseudoTargetLabel); branch_over->target = target; } /* * Handle unlocked -> thin locked transition inline or else call out to quick entrypoint. For more * details see monitor.cc. 
*/ void Arm64Mir2Lir::GenMonitorEnter(int opt_flags, RegLocation rl_src) { // x0/w0 = object // w1 = thin lock thread id // x2 = address of lock word // w3 = lock word / store failure // TUNING: How much performance we get when we inline this? // Since we've already flush all register. FlushAllRegs(); LoadValueDirectFixed(rl_src, rs_x0); // = TargetReg(kArg0, kRef) LockCallTemps(); // Prepare for explicit register usage LIR* null_check_branch = nullptr; if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) { null_check_branch = nullptr; // No null check. } else { // If the null-check fails its handled by the slow-path to reduce exception related meta-data. if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitNullChecks()) { null_check_branch = OpCmpImmBranch(kCondEq, rs_x0, 0, nullptr); } } Load32Disp(rs_xSELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_w1); OpRegRegImm(kOpAdd, rs_x2, rs_x0, mirror::Object::MonitorOffset().Int32Value()); NewLIR2(kA64Ldxr2rX, rw3, rx2); MarkPossibleNullPointerException(opt_flags); // Zero out the read barrier bits. OpRegRegImm(kOpAnd, rs_w2, rs_w3, LockWord::kReadBarrierStateMaskShiftedToggled); LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_w2, 0, nullptr); // w3 is zero except for the rb bits here. Copy the read barrier bits into w1. OpRegRegReg(kOpOr, rs_w1, rs_w1, rs_w3); OpRegRegImm(kOpAdd, rs_x2, rs_x0, mirror::Object::MonitorOffset().Int32Value()); NewLIR3(kA64Stxr3wrX, rw3, rw1, rx2); LIR* lock_success_branch = OpCmpImmBranch(kCondEq, rs_w3, 0, nullptr); LIR* slow_path_target = NewLIR0(kPseudoTargetLabel); not_unlocked_branch->target = slow_path_target; if (null_check_branch != nullptr) { null_check_branch->target = slow_path_target; } // TODO: move to a slow path. 
// Go expensive route - artLockObjectFromCode(obj); LoadWordDisp(rs_xSELF, QUICK_ENTRYPOINT_OFFSET(8, pLockObject).Int32Value(), rs_xLR); ClobberCallerSave(); LIR* call_inst = OpReg(kOpBlx, rs_xLR); MarkSafepointPC(call_inst); LIR* success_target = NewLIR0(kPseudoTargetLabel); lock_success_branch->target = success_target; GenMemBarrier(kLoadAny); } /* * Handle thin locked -> unlocked transition inline or else call out to quick entrypoint. For more * details see monitor.cc. Note the code below doesn't use ldxr/stxr as the code holds the lock * and can only give away ownership if its suspended. */ void Arm64Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) { // x0/w0 = object // w1 = thin lock thread id // w2 = lock word // TUNING: How much performance we get when we inline this? // Since we've already flush all register. FlushAllRegs(); LoadValueDirectFixed(rl_src, rs_x0); // Get obj LockCallTemps(); // Prepare for explicit register usage LIR* null_check_branch = nullptr; if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) { null_check_branch = nullptr; // No null check. } else { // If the null-check fails its handled by the slow-path to reduce exception related meta-data. if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitNullChecks()) { null_check_branch = OpCmpImmBranch(kCondEq, rs_x0, 0, nullptr); } } Load32Disp(rs_xSELF, Thread::ThinLockIdOffset<8>().Int32Value(), rs_w1); if (!kUseReadBarrier) { Load32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_w2); } else { OpRegRegImm(kOpAdd, rs_x3, rs_x0, mirror::Object::MonitorOffset().Int32Value()); NewLIR2(kA64Ldxr2rX, rw2, rx3); } MarkPossibleNullPointerException(opt_flags); // Zero out the read barrier bits. OpRegRegImm(kOpAnd, rs_w3, rs_w2, LockWord::kReadBarrierStateMaskShiftedToggled); // Zero out except the read barrier bits. 
OpRegRegImm(kOpAnd, rs_w2, rs_w2, LockWord::kReadBarrierStateMaskShifted); LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_w3, rs_w1, nullptr); GenMemBarrier(kAnyStore); LIR* unlock_success_branch; if (!kUseReadBarrier) { Store32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_w2); unlock_success_branch = OpUnconditionalBranch(nullptr); } else { OpRegRegImm(kOpAdd, rs_x3, rs_x0, mirror::Object::MonitorOffset().Int32Value()); NewLIR3(kA64Stxr3wrX, rw1, rw2, rx3); unlock_success_branch = OpCmpImmBranch(kCondEq, rs_w1, 0, nullptr); } LIR* slow_path_target = NewLIR0(kPseudoTargetLabel); slow_unlock_branch->target = slow_path_target; if (null_check_branch != nullptr) { null_check_branch->target = slow_path_target; } // TODO: move to a slow path. // Go expensive route - artUnlockObjectFromCode(obj); LoadWordDisp(rs_xSELF, QUICK_ENTRYPOINT_OFFSET(8, pUnlockObject).Int32Value(), rs_xLR); ClobberCallerSave(); LIR* call_inst = OpReg(kOpBlx, rs_xLR); MarkSafepointPC(call_inst); LIR* success_target = NewLIR0(kPseudoTargetLabel); unlock_success_branch->target = success_target; } void Arm64Mir2Lir::GenMoveException(RegLocation rl_dest) { int ex_offset = Thread::ExceptionOffset<8>().Int32Value(); RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); LoadRefDisp(rs_xSELF, ex_offset, rl_result.reg, kNotVolatile); StoreRefDisp(rs_xSELF, ex_offset, rs_xzr, kNotVolatile); StoreValue(rl_dest, rl_result); } void Arm64Mir2Lir::UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) { RegStorage reg_card_base = AllocTempWide(); RegStorage reg_card_no = AllocTempWide(); // Needs to be wide as addr is ref=64b LoadWordDisp(rs_xSELF, Thread::CardTableOffset<8>().Int32Value(), reg_card_base); OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift); // TODO(Arm64): generate "strb wB, [xB, wC, uxtw]" rather than "strb wB, [xB, xC]"? 
StoreBaseIndexed(reg_card_base, reg_card_no, As32BitReg(reg_card_base), 0, kUnsignedByte); FreeTemp(reg_card_base); FreeTemp(reg_card_no); } static dwarf::Reg DwarfCoreReg(int num) { return dwarf::Reg::Arm64Core(num); } void Arm64Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method) { DCHECK_EQ(cfi_.GetCurrentCFAOffset(), 0); // empty stack. /* * On entry, x0 to x7 are live. Let the register allocation * mechanism know so it doesn't try to use any of them when * expanding the frame or flushing. * Reserve x8 & x9 for temporaries. */ LockTemp(rs_x0); LockTemp(rs_x1); LockTemp(rs_x2); LockTemp(rs_x3); LockTemp(rs_x4); LockTemp(rs_x5); LockTemp(rs_x6); LockTemp(rs_x7); LockTemp(rs_xIP0); LockTemp(rs_xIP1); /* TUNING: * Use AllocTemp() and reuse LR if possible to give us the freedom on adjusting the number * of temp registers. */ /* * We can safely skip the stack overflow check if we're * a leaf *and* our frame size < fudge factor. */ bool skip_overflow_check = mir_graph_->MethodIsLeaf() && !FrameNeedsStackCheck(frame_size_, kArm64); const size_t kStackOverflowReservedUsableBytes = GetStackOverflowReservedBytes(kArm64); const bool large_frame = static_cast(frame_size_) > kStackOverflowReservedUsableBytes; bool generate_explicit_stack_overflow_check = large_frame || !cu_->compiler_driver->GetCompilerOptions().GetImplicitStackOverflowChecks(); const int spill_count = num_core_spills_ + num_fp_spills_; const int spill_size = (spill_count * kArm64PointerSize + 15) & ~0xf; // SP 16 byte alignment. const int frame_size_without_spills = frame_size_ - spill_size; if (!skip_overflow_check) { if (generate_explicit_stack_overflow_check) { // Load stack limit LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1); } else { // Implicit stack overflow check. // Generate a load from [sp, #-framesize]. If this is in the stack // redzone we will get a segmentation fault. 
// TODO: If the frame size is small enough, is it possible to make this a pre-indexed load, // so that we can avoid the following "sub sp" when spilling? OpRegRegImm(kOpSub, rs_x8, rs_sp, GetStackOverflowReservedBytes(kArm64)); Load32Disp(rs_x8, 0, rs_wzr); MarkPossibleStackOverflowException(); } } int spilled_already = 0; if (spill_size > 0) { spilled_already = SpillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_); DCHECK(spill_size == spilled_already || frame_size_ == spilled_already); } if (spilled_already != frame_size_) { OpRegImm(kOpSub, rs_sp, frame_size_without_spills); cfi_.AdjustCFAOffset(frame_size_without_spills); } if (!skip_overflow_check) { if (generate_explicit_stack_overflow_check) { class StackOverflowSlowPath: public LIRSlowPath { public: StackOverflowSlowPath(Mir2Lir* m2l, LIR* branch, size_t sp_displace) : LIRSlowPath(m2l, branch), sp_displace_(sp_displace) { } void Compile() OVERRIDE { m2l_->ResetRegPool(); m2l_->ResetDefTracking(); GenerateTargetLabel(kPseudoThrowTarget); // Unwinds stack. m2l_->OpRegImm(kOpAdd, rs_sp, sp_displace_); m2l_->cfi().AdjustCFAOffset(-sp_displace_); m2l_->ClobberCallerSave(); ThreadOffset<8> func_offset = QUICK_ENTRYPOINT_OFFSET(8, pThrowStackOverflow); m2l_->LockTemp(rs_xIP0); m2l_->LoadWordDisp(rs_xSELF, func_offset.Int32Value(), rs_xIP0); m2l_->NewLIR1(kA64Br1x, rs_xIP0.GetReg()); m2l_->FreeTemp(rs_xIP0); m2l_->cfi().AdjustCFAOffset(sp_displace_); } private: const size_t sp_displace_; }; LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr); AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_)); } } FlushIns(ArgLocs, rl_method); FreeTemp(rs_x0); FreeTemp(rs_x1); FreeTemp(rs_x2); FreeTemp(rs_x3); FreeTemp(rs_x4); FreeTemp(rs_x5); FreeTemp(rs_x6); FreeTemp(rs_x7); FreeTemp(rs_xIP0); FreeTemp(rs_xIP1); } void Arm64Mir2Lir::GenExitSequence() { cfi_.RememberState(); /* * In the exit path, r0/r1 are live - make sure they aren't * allocated by the register utilities as temps. 
*/ LockTemp(rs_x0); LockTemp(rs_x1); UnspillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_); // Finally return. NewLIR0(kA64Ret); // The CFI should be restored for any code that follows the exit block. cfi_.RestoreState(); cfi_.DefCFAOffset(frame_size_); } void Arm64Mir2Lir::GenSpecialExitSequence() { NewLIR0(kA64Ret); } void Arm64Mir2Lir::GenSpecialEntryForSuspend() { // Keep 16-byte stack alignment - push x0, i.e. ArtMethod*, lr. core_spill_mask_ = (1u << rs_xLR.GetRegNum()); num_core_spills_ = 1u; fp_spill_mask_ = 0u; num_fp_spills_ = 0u; frame_size_ = 16u; core_vmap_table_.clear(); fp_vmap_table_.clear(); NewLIR4(WIDE(kA64StpPre4rrXD), rs_x0.GetReg(), rs_xLR.GetReg(), rs_sp.GetReg(), -frame_size_ / 8); cfi_.AdjustCFAOffset(frame_size_); // Do not generate CFI for scratch register x0. cfi_.RelOffset(DwarfCoreReg(rxLR), 8); } void Arm64Mir2Lir::GenSpecialExitForSuspend() { // Pop the frame. (ArtMethod* no longer needed but restore it anyway.) NewLIR4(WIDE(kA64LdpPost4rrXD), rs_x0.GetReg(), rs_xLR.GetReg(), rs_sp.GetReg(), frame_size_ / 8); cfi_.AdjustCFAOffset(-frame_size_); cfi_.Restore(DwarfCoreReg(rxLR)); } static bool Arm64UseRelativeCall(CompilationUnit* cu, const MethodReference& target_method) { // Emit relative calls anywhere in the image or within a dex file otherwise. return cu->compiler_driver->IsImage() || cu->dex_file == target_method.dex_file; } /* * Bit of a hack here - in the absence of a real scheduling pass, * emit the next instruction in static & direct invoke sequences. 
*/ int Arm64Mir2Lir::Arm64NextSDCallInsn(CompilationUnit* cu, CallInfo* info, int state, const MethodReference& target_method, uint32_t unused_idx ATTRIBUTE_UNUSED, uintptr_t direct_code, uintptr_t direct_method, InvokeType type) { Arm64Mir2Lir* cg = static_cast(cu->cg.get()); if (info->string_init_offset != 0) { RegStorage arg0_ref = cg->TargetReg(kArg0, kRef); switch (state) { case 0: { // Grab target method* from thread pointer cg->LoadWordDisp(rs_xSELF, info->string_init_offset, arg0_ref); break; } case 1: // Grab the code from the method* if (direct_code == 0) { // kInvokeTgt := arg0_ref->entrypoint cg->LoadWordDisp(arg0_ref, ArtMethod::EntryPointFromQuickCompiledCodeOffset( kArm64PointerSize).Int32Value(), cg->TargetPtrReg(kInvokeTgt)); } break; default: return -1; } } else if (direct_code != 0 && direct_method != 0) { switch (state) { case 0: // Get the current Method* [sets kArg0] if (direct_code != static_cast(-1)) { cg->LoadConstantWide(cg->TargetPtrReg(kInvokeTgt), direct_code); } else if (Arm64UseRelativeCall(cu, target_method)) { // Defer to linker patch. } else { cg->LoadCodeAddress(target_method, type, kInvokeTgt); } if (direct_method != static_cast(-1)) { cg->LoadConstantWide(cg->TargetReg(kArg0, kRef), direct_method); } else { cg->LoadMethodAddress(target_method, type, kArg0); } break; default: return -1; } } else { bool use_pc_rel = cg->CanUseOpPcRelDexCacheArrayLoad(); RegStorage arg0_ref = cg->TargetPtrReg(kArg0); switch (state) { case 0: // Get the current Method* [sets kArg0] // TUNING: we can save a reg copy if Method* has been promoted. if (!use_pc_rel) { cg->LoadCurrMethodDirect(arg0_ref); break; } ++state; FALLTHROUGH_INTENDED; case 1: // Get method->dex_cache_resolved_methods_ if (!use_pc_rel) { cg->LoadRefDisp(arg0_ref, ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(), arg0_ref, kNotVolatile); } // Set up direct code if known. 
if (direct_code != 0) { if (direct_code != static_cast(-1)) { cg->LoadConstantWide(cg->TargetPtrReg(kInvokeTgt), direct_code); } else if (Arm64UseRelativeCall(cu, target_method)) { // Defer to linker patch. } else { CHECK_LT(target_method.dex_method_index, target_method.dex_file->NumMethodIds()); cg->LoadCodeAddress(target_method, type, kInvokeTgt); } } if (!use_pc_rel || direct_code != 0) { break; } ++state; FALLTHROUGH_INTENDED; case 2: // Grab target method* CHECK_EQ(cu->dex_file, target_method.dex_file); if (!use_pc_rel) { cg->LoadWordDisp(arg0_ref, mirror::Array::DataOffset(kArm64PointerSize).Uint32Value() + target_method.dex_method_index * kArm64PointerSize, arg0_ref); } else { size_t offset = cg->dex_cache_arrays_layout_.MethodOffset(target_method.dex_method_index); cg->OpPcRelDexCacheArrayLoad(cu->dex_file, offset, arg0_ref, true); } break; case 3: // Grab the code from the method* if (direct_code == 0) { // kInvokeTgt := arg0_ref->entrypoint cg->LoadWordDisp(arg0_ref, ArtMethod::EntryPointFromQuickCompiledCodeOffset( kArm64PointerSize).Int32Value(), cg->TargetPtrReg(kInvokeTgt)); } break; default: return -1; } } return state + 1; } NextCallInsn Arm64Mir2Lir::GetNextSDCallInsn() { return Arm64NextSDCallInsn; } LIR* Arm64Mir2Lir::CallWithLinkerFixup(const MethodReference& target_method, InvokeType type) { // For ARM64, just generate a relative BL instruction that will be filled in at 'link time'. // If the target turns out to be too far, the linker will generate a thunk for dispatch. int target_method_idx = target_method.dex_method_index; const DexFile* target_dex_file = target_method.dex_file; // Generate the call instruction and save index, dex_file, and type. // NOTE: Method deduplication takes linker patches into account, so we can just pass 0 // as a placeholder for the offset. 
LIR* call = RawLIR(current_dalvik_offset_, kA64Bl1t, 0, target_method_idx, WrapPointer(target_dex_file), type); AppendLIR(call); call_method_insns_.push_back(call); return call; } LIR* Arm64Mir2Lir::GenCallInsn(const MirMethodLoweringInfo& method_info) { LIR* call_insn; if (method_info.FastPath() && Arm64UseRelativeCall(cu_, method_info.GetTargetMethod()) && (method_info.GetSharpType() == kDirect || method_info.GetSharpType() == kStatic) && method_info.DirectCode() == static_cast(-1)) { call_insn = CallWithLinkerFixup(method_info.GetTargetMethod(), method_info.GetSharpType()); } else { call_insn = OpReg(kOpBlx, TargetPtrReg(kInvokeTgt)); } return call_insn; } } // namespace art