/*
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_
#define BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_

#include "berberis/backend/x86_64/machine_ir.h"
#include "berberis/backend/x86_64/machine_ir_builder.h"
#include "berberis/base/arena_map.h"
#include "berberis/base/checks.h"
#include "berberis/base/dependent_false.h"
#include "berberis/decoder/riscv64/decoder.h"
#include "berberis/decoder/riscv64/semantics_player.h"
#include "berberis/guest_state/guest_addr.h"
#include "berberis/guest_state/guest_state_arch.h"
#include "berberis/guest_state/guest_state_opaque.h"
#include "berberis/intrinsics/intrinsics.h"
#include "berberis/intrinsics/macro_assembler.h"
#include "berberis/runtime_primitives/memory_region_reservation.h"
#include "berberis/runtime_primitives/platform.h"

#include "call_intrinsic.h"
#include "inline_intrinsic.h"
#include "simd_register.h"

namespace berberis {

class HeavyOptimizerFrontend {
 public:
  using CsrName = berberis::CsrName;
  using Decoder = Decoder<SemanticsPlayer<HeavyOptimizerFrontend>>;
  using Register = MachineReg;
  using FpRegister = SimdReg;
  using Float32 = intrinsics::Float32;
  using Float64 = intrinsics::Float64;

  struct MemoryOperand {
    Register base{0};
    // We call the following field "index" even though we do not scale it at the
    // moment. We can add a scale as the need arises.
    Register index{0};
    uint64_t disp = 0;
  };

  explicit HeavyOptimizerFrontend(x86_64::MachineIR* machine_ir, GuestAddr pc)
      : pc_(pc),
        success_(true),
        builder_(machine_ir),
        flag_register_(machine_ir->AllocVReg()),
        is_uncond_branch_(false),
        branch_targets_(machine_ir->arena()) {
    StartRegion();
  }

  void CompareAndBranch(Decoder::BranchOpcode opcode, Register arg1, Register arg2, int16_t offset);
  void Branch(int32_t offset);
  void BranchRegister(Register base, int16_t offset);

  [[nodiscard]] Register GetImm(uint64_t imm);

  [[nodiscard]] Register Copy(Register value) {
    Register result = AllocTempReg();
    Gen<PseudoCopy>(result, value, 8);
    return result;
  }

  [[nodiscard]] Register GetReg(uint8_t reg);
  void SetReg(uint8_t reg, Register value);

  void Undefined();

  //
  // Instruction implementations.
  //
  void Nop();
  Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2);
  Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2);
  Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm);
  Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm);
  Register Slli(Register arg, int8_t imm);
  Register Srli(Register arg, int8_t imm);
  Register Srai(Register arg, int8_t imm);
  Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm);
  Register Rori(Register arg, int8_t shamt);
  Register Roriw(Register arg, int8_t shamt);
  Register Lui(int32_t imm);
  Register Auipc(int32_t imm);

  Register Ecall(Register /* syscall_nr */,
                 Register /* arg0 */,
                 Register /* arg1 */,
                 Register /* arg2 */,
                 Register /* arg3 */,
                 Register /* arg4 */,
                 Register /* arg5 */) {
    Undefined();
    return {};
  }

  void Store(Decoder::MemoryDataOperandType operand_type,
             Register arg,
             int16_t offset,
             Register data);
  Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset);

  template <typename IntType>
  constexpr Decoder::LoadOperandType ToLoadOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t>) {
      return Decoder::LoadOperandType::k8bitSigned;
    } else if constexpr (std::is_same_v<IntType, int16_t>) {
      return Decoder::LoadOperandType::k16bitSigned;
    } else if constexpr (std::is_same_v<IntType, int32_t>) {
      return Decoder::LoadOperandType::k32bitSigned;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::LoadOperandType::k64bit;
    } else if constexpr (std::is_same_v<IntType, uint8_t>) {
      return Decoder::LoadOperandType::k8bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint16_t>) {
      return Decoder::LoadOperandType::k16bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint32_t>) {
      return Decoder::LoadOperandType::k32bitUnsigned;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }

  template <typename IntType>
  constexpr Decoder::MemoryDataOperandType ToMemoryDataOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t> || std::is_same_v<IntType, uint8_t>) {
      return Decoder::MemoryDataOperandType::k8bit;
    } else if constexpr (std::is_same_v<IntType, int16_t> || std::is_same_v<IntType, uint16_t>) {
      return Decoder::MemoryDataOperandType::k16bit;
    } else if constexpr (std::is_same_v<IntType, int32_t> || std::is_same_v<IntType, uint32_t>) {
      return Decoder::MemoryDataOperandType::k32bit;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::MemoryDataOperandType::k64bit;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }

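  // For example (illustrative only): ToLoadOperandType<uint16_t>() yields
  // LoadOperandType::k16bitUnsigned and ToMemoryDataOperandType<uint16_t>() yields
  // MemoryDataOperandType::k16bit. Lr<IntType>() and Sc<IntType>() below use these helpers to
  // pick the access width that matches the guest's LR/SC data type.
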
  // Versions without recovery can be used to access non-guest memory (e.g. CPUState).
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type, Register base, int32_t disp);
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type,
                               Register base,
                               Register index,
                               int32_t disp);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            int32_t disp,
                            Register val);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            Register index,
                            int32_t disp,
                            Register val);

  //
  // Atomic extensions.
  //

  template <typename IntType, bool aq, bool rl>
  Register Lr(Register addr) {
    Register aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign extended to 64-bit.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    MemoryRegionReservationLoad(aligned_addr);

    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());

    // Load the requested part from CPUState.
    return LoadWithoutRecovery(ToLoadOperandType<IntType>(),
                               x86_64::kMachineRegRBP,
                               addr_offset,
                               GetThreadStateReservationValueOffset());
  }

  template <typename IntType, bool aq, bool rl>
  Register Sc(Register addr, Register data) {
    // Compute aligned_addr.
    auto aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign extended to 64-bit.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    // Load current monitor value before we clobber it.
    auto reservation_value = AllocTempReg();
    int32_t value_offset = GetThreadStateReservationValueOffset();
    Gen<x86_64::MovqRegMemBaseDisp>(reservation_value, x86_64::kMachineRegRBP, value_offset);
    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());
    // It's okay to clobber reservation_value since we clear out reservation_address in
    // MemoryRegionReservationExchange anyway.
    StoreWithoutRecovery(ToMemoryDataOperandType<IntType>(),
                         x86_64::kMachineRegRBP,
                         addr_offset,
                         value_offset,
                         data);

    return MemoryRegionReservationExchange(aligned_addr, reservation_value);
  }

  void Fence(Decoder::FenceOpcode opcode,
             Register src,
             bool sw,
             bool sr,
             bool so,
             bool si,
             bool pw,
             bool pr,
             bool po,
             bool pi);

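  // Taken together, Lr() and Sc() above implement the RISC-V A-extension LR/SC pair on top of
  // MemoryRegionReservation: Lr() roughly aligns the guest address down to the reservation
  // granule, loads the granule into the per-thread reservation value in CPUState, and returns the
  // requested slice of it; Sc() writes the new data into that cached granule and calls
  // MemoryRegionReservationExchange() with the aligned address and the previously loaded value,
  // returning its result to the guest as the SC outcome (RISC-V defines 0 as success and non-zero
  // as failure). A rough guest-to-translation sketch (illustrative only; register names are
  // hypothetical):
  //
  //   lr.d t0, (a0)       ->  t0_reg = Lr<uint64_t, aq, rl>(a0_reg);
  //   sc.d t1, t2, (a0)   ->  t1_reg = Sc<uint64_t, aq, rl>(a0_reg, t2_reg);
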
  //
  // F and D extensions.
  //
  [[nodiscard]] FpRegister GetFpReg(uint8_t reg);

  template <typename FloatType>
  [[nodiscard]] FpRegister GetFRegAndUnboxNan(uint8_t reg) {
    CHECK_LE(reg, kNumGuestFpRegs);
    FpRegister result = AllocTempSimdReg();
    builder_.GenGetSimd<8>(result.machine_reg(), GetThreadStateFRegOffset(reg));
    FpRegister unboxed_result = AllocTempSimdReg();
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroUnboxNanFloat32AVX>(unboxed_result.machine_reg(),
                                                    result.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroUnboxNanFloat32>(unboxed_result.machine_reg(),
                                                 result.machine_reg());
    }
    return unboxed_result;
  }

  template <typename FloatType>
  void NanBoxFpReg(FpRegister value) {
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroNanBoxFloat32AVX>(value.machine_reg(), value.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroNanBoxFloat32>(value.machine_reg());
    }
  }

  template <typename FloatType>
  void NanBoxAndSetFpReg(uint8_t reg, FpRegister value) {
    CHECK_LE(reg, kNumGuestFpRegs);
    if (success()) {
      NanBoxFpReg<FloatType>(value);
      builder_.GenSetSimd<8>(GetThreadStateFRegOffset(reg), value.machine_reg());
    }
  }

  template <typename DataType>
  FpRegister LoadFp(Register arg, int16_t offset) {
    auto res = AllocTempSimdReg();
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
    return res;
  }

  template <typename DataType>
  void StoreFp(Register arg, int16_t offset, FpRegister data) {
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
  }

  FpRegister Fmv(FpRegister arg) {
    auto res = AllocTempSimdReg();
    Gen<PseudoCopy>(res.machine_reg(), arg.machine_reg(), 16);
    return res;
  }

  //
  // V extension.
  //

  template <typename VOpArgs, typename... ExtraArgs>
  void OpVector(const VOpArgs& /*args*/, ExtraArgs... /*extra_args*/) {
    // TODO(b/300690740): develop and implement strategy which would allow us to support vector
    // intrinsics not just in the interpreter.
    Undefined();
  }

  //
  // Csr
  //

  Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr);
  Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr);

  [[nodiscard]] bool success() const { return success_; }

  //
  // Intrinsic proxy methods.
  //

#include "berberis/intrinsics/translator_intrinsics_hooks-inl.h"

  //
  // Guest state getters/setters.
  //

  [[nodiscard]] GuestAddr GetInsnAddr() const { return pc_; }
  void IncrementInsnAddr(uint8_t insn_size) { pc_ += insn_size; }

  [[nodiscard]] bool IsRegionEndReached() const;
  void StartInsn();
  void Finalize(GuestAddr stop_pc);

  // These methods are exported only for testing.
  [[nodiscard]] const ArenaMap<GuestAddr, MachineInsnPosition>& branch_targets() const {
    return branch_targets_;
  }

  template <CsrName kName>
  [[nodiscard]] Register GetCsr() {
    auto csr_reg = AllocTempReg();
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovzxblRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      Gen<x86_64::MovqRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
    return csr_reg;
  }

  template <CsrName kName>
  void SetCsr(uint8_t imm) {
    // Note: a csr immediate only has 5 bits in the RISC-V encoding, which guarantees that
    // "imm & kCsrMask<kName>" can be used as an 8-bit immediate.
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int8_t>(imm & kCsrMask<kName>));
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int8_t>(imm & kCsrMask<kName>));
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

  template <CsrName kName>
  void SetCsr(Register arg) {
    auto tmp = AllocTempReg();
    Gen<PseudoCopy>(tmp, arg, sizeof(CsrFieldType<kName>));
    if constexpr (sizeof(CsrFieldType<kName>) == 1) {
      Gen<x86_64::AndbRegImm>(tmp, kCsrMask<kName>, GetFlagsRegister());
      Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else if constexpr (sizeof(CsrFieldType<kName>) == 8) {
      Gen<x86_64::AndqRegMemAbsolute>(
          tmp, constants_pool::kConst<uint64_t{kCsrMask<kName>}>, GetFlagsRegister());
      Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

 private:
  // Specialization for AssemblerResType=void
  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  void CallIntrinsic(AssemblerArgType... args) {
    if (TryInlineIntrinsicForHeavyOptimizerVoid<kFunction>(
            &builder_, GetFlagsRegister(), args...)) {
      return;
    }

    CallIntrinsicImpl(&builder_, kFunction, GetFlagsRegister(), args...);
  }

  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<!std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  AssemblerResType CallIntrinsic(AssemblerArgType... args) {
    AssemblerResType result;

    if constexpr (std::is_same_v<AssemblerResType, Register>) {
      result = AllocTempReg();
    } else if constexpr (std::is_same_v<AssemblerResType, SimdReg>) {
      result = AllocTempSimdReg();
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<Register, Register>>) {
      result = {AllocTempReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, Register>>) {
      result = {AllocTempSimdReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType,
                                        std::tuple<SimdReg, SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else {
      // This should not be reached by the compiler. If it is - there is a new result type that
      // needs to be supported.
      static_assert(kDependentTypeFalse<AssemblerResType>, "Unsupported result type");
    }

    if (TryInlineIntrinsicForHeavyOptimizer<kFunction>(
            &builder_, result, GetFlagsRegister(), args...)) {
      return result;
    }

    CallIntrinsicImpl(&builder_, kFunction, result, GetFlagsRegister(), args...);
    return result;
  }

  void MemoryRegionReservationLoad(Register aligned_addr);
  Register MemoryRegionReservationExchange(Register aligned_addr, Register curr_reservation_value);
  void MemoryRegionReservationSwapWithLockedOwner(Register aligned_addr,
                                                  Register curr_reservation_value,
                                                  Register new_reservation_value,
                                                  MachineBasicBlock* failure_bb);

  // Syntax sugar.
  template <typename InsnType, typename... Args>
  /*may_discard*/ InsnType* Gen(Args... args) {
    return builder_.Gen<InsnType, Args...>(args...);
  }

  static x86_64::Assembler::Condition ToAssemblerCond(Decoder::BranchOpcode opcode);

  [[nodiscard]] Register AllocTempReg();
  [[nodiscard]] SimdReg AllocTempSimdReg();
  [[nodiscard]] Register GetFlagsRegister() const { return flag_register_; }

  void GenJump(GuestAddr target);
  void ExitGeneratedCode(GuestAddr target);
  void ExitRegionIndirect(Register target);

  void GenRecoveryBlockForLastInsn();

  void ResolveJumps();
  void ReplaceJumpWithBranch(MachineBasicBlock* bb, MachineBasicBlock* target_bb);
  void UpdateBranchTargetsAfterSplit(GuestAddr addr,
                                     const MachineBasicBlock* old_bb,
                                     MachineBasicBlock* new_bb);

  void StartRegion() {
    auto* region_entry_bb = builder_.ir()->NewBasicBlock();
    auto* cont_bb = builder_.ir()->NewBasicBlock();
    builder_.ir()->AddEdge(region_entry_bb, cont_bb);
    builder_.StartBasicBlock(region_entry_bb);
    Gen<PseudoBranch>(cont_bb);
    builder_.StartBasicBlock(cont_bb);
  }

  GuestAddr pc_;
  bool success_;
  x86_64::MachineIRBuilder builder_;
  MachineReg flag_register_;
  bool is_uncond_branch_;
  // Contains IR positions of all guest instructions of the current region.
  // Also contains all branch targets which the current region jumps to.
  // If the target is outside of the current region the position is uninitialized,
  // i.e. its basic block (position.first) is nullptr.
  ArenaMap<GuestAddr, MachineInsnPosition> branch_targets_;
};

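// A rough usage sketch of this frontend (illustrative only; the Decoder/SemanticsPlayer wiring
// and the Decode call shown here are assumptions, the real driver lives in the heavy optimizer
// translation pass):
//
//   HeavyOptimizerFrontend frontend(&machine_ir, pc);
//   SemanticsPlayer<HeavyOptimizerFrontend> player(&frontend);
//   HeavyOptimizerFrontend::Decoder decoder(&player);
//   while (frontend.success() && !frontend.IsRegionEndReached()) {
//     frontend.StartInsn();
//     uint8_t size = decoder.Decode(/* host pointer to guest code at frontend.GetInsnAddr() */);
//     frontend.IncrementInsnAddr(size);
//   }
//   frontend.Finalize(frontend.GetInsnAddr());
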
template <>
[[nodiscard]] inline HeavyOptimizerFrontend::FpRegister
HeavyOptimizerFrontend::GetFRegAndUnboxNan<intrinsics::Float64>(uint8_t reg) {
  return GetFpReg(reg);
}

template <>
inline void HeavyOptimizerFrontend::NanBoxFpReg<intrinsics::Float64>(FpRegister) {}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kCycle>() {
  return CPUClockCount();
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFCsr>() {
  auto csr_reg = AllocTempReg();
  auto tmp = AllocTempReg();
  InlineIntrinsicForHeavyOptimizer<&intrinsics::FeGetExceptions>(
      &builder_, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegMemBaseDisp>(
      csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>);
  Gen<x86_64::ShlbRegImm>(csr_reg, 5, GetFlagsRegister());
  Gen<x86_64::OrbRegReg>(csr_reg, tmp, GetFlagsRegister());
  return csr_reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFFlags>() {
  return FeGetExceptions();
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVlenb>() {
  return GetImm(16);
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxrm>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::AndbRegImm>(reg, 0b11, GetFlagsRegister());
  return reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxsat>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::ShrbRegImm>(reg, 2, GetFlagsRegister());
  return reg;
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(uint8_t imm) {
  // Note: the Csrrci and Csrrsi instructions can't affect Frm because their immediate only has
  // five bits. But these instructions don't pass their immediate-specified argument into
  // `SetCsr`; they combine it with the register first. Fixing that can only be done by changing
  // code in the semantics player.
  //
  // But Csrrwi may clear it. And we actually may only arrive here from Csrrwi.
  // Thus, technically, we know that imm >> 5 is always zero, but it doesn't look like a good idea
  // to rely on that: it's very subtle and it only affects code generation speed.
  Gen<x86_64::MovbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, static_cast<int8_t>(imm >> 5));
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRoundImm>(
      &builder_, GetFlagsRegister(), imm);
}

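// Layout note (from the RISC-V F extension): fcsr packs fflags into bits 4:0 and frm into
// bits 7:5. That is why the immediate variant above stores `imm >> 5` into the Frm field, and
// why the register variant below extracts the exception flags from the low five bits and the
// rounding mode from bits 7:5.
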
template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(Register arg) {
  // Check size to be sure we can use Andb and Movb below.
  static_assert(sizeof(kCsrMask<CsrName::kFrm>) == 1);

  auto exceptions = AllocTempReg();
  auto rounding_mode = AllocTempReg();
  Gen<PseudoCopy>(exceptions, arg, 1);
  Gen<x86_64::AndlRegImm>(exceptions, 0b1'1111, GetFlagsRegister());
  // We don't care about the data in rounding_mode because we will shift in the
  // data we need.
  Gen<PseudoDefReg>(rounding_mode);
  Gen<x86_64::ShldlRegRegImm>(rounding_mode, arg, int8_t{32 - 5}, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(rounding_mode, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, rounding_mode);
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRound>(
      &builder_, GetFlagsRegister(), exceptions, rounding_mode);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(uint8_t imm) {
  FeSetExceptionsImm(static_cast<int8_t>(imm & 0b1'1111));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(Register arg) {
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndlRegImm>(tmp, 0b1'1111, GetFlagsRegister());
  FeSetExceptions(tmp);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(uint8_t imm) {
  Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                  kCsrFieldOffset<CsrName::kFrm>,
                                  static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
  FeSetRoundImm(static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(Register arg) {
  // Use RCX as temporary register. We know it would be used by FeSetRound, too.
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndbRegImm>(tmp, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, tmp);
  FeSetRound(tmp);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(uint8_t imm) {
  imm &= 0b11;
  if (imm != 0b11) {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  }
  if (imm != 0b00) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, imm, GetFlagsRegister());
  }
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(Register arg) {
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(arg, 0b11, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, arg, GetFlagsRegister());
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(uint8_t imm) {
  if (imm & 0b1) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  } else {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
  }
}

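// The register variant below performs the same vxsat update for a runtime value: it tests bit 0
// of `arg`, materializes the outcome as 0 or 1 with setcc, shifts it into bit 2, and ORs it into
// the Vcsr guest-state field. In this internal encoding (as used by the Vxrm/Vxsat accessors
// above) the low two bits of Vcsr hold vxrm and bit 2 holds vxsat.
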
template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(Register arg) {
  using Condition = x86_64::Assembler::Condition;
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
  Gen<x86_64::TestbRegImm>(arg, 1, GetFlagsRegister());
  auto tmp = AllocTempReg();
  Gen<x86_64::SetccReg>(Condition::kNotZero, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegReg>(tmp, tmp);
  Gen<x86_64::ShlbRegImm>(tmp, int8_t{2}, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, tmp, GetFlagsRegister());
}

}  // namespace berberis

#endif /* BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_ */