/*
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_
#define BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_

#include "berberis/backend/x86_64/machine_ir.h"
#include "berberis/backend/x86_64/machine_ir_builder.h"
#include "berberis/base/arena_map.h"
#include "berberis/base/checks.h"
#include "berberis/base/dependent_false.h"
#include "berberis/decoder/riscv64/decoder.h"
#include "berberis/decoder/riscv64/semantics_player.h"
#include "berberis/guest_state/guest_addr.h"
#include "berberis/guest_state/guest_state_arch.h"
#include "berberis/guest_state/guest_state_opaque.h"
#include "berberis/intrinsics/intrinsics.h"
#include "berberis/intrinsics/macro_assembler.h"
#include "berberis/runtime_primitives/memory_region_reservation.h"
#include "berberis/runtime_primitives/platform.h"

#include "call_intrinsic.h"
#include "inline_intrinsic.h"
#include "simd_register.h"

namespace berberis {

class HeavyOptimizerFrontend {
 public:
  using CsrName = berberis::CsrName;
  using Decoder = Decoder<SemanticsPlayer<HeavyOptimizerFrontend>>;
  using Register = MachineReg;
  using FpRegister = SimdReg;
  using Float32 = intrinsics::Float32;
  using Float64 = intrinsics::Float64;

  struct MemoryOperand {
    Register base{0};
    // We call the following field "index" even though we do not scale it at the
    // moment.  We can add a scale as the need arises.
    Register index{0};
    uint64_t disp = 0;
  };

  explicit HeavyOptimizerFrontend(x86_64::MachineIR* machine_ir, GuestAddr pc)
      : pc_(pc),
        success_(true),
        builder_(machine_ir),
        flag_register_(machine_ir->AllocVReg()),
        is_uncond_branch_(false),
        branch_targets_(machine_ir->arena()) {
    StartRegion();
  }

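  // Control-transfer implementations. Branch offsets are relative to the
  // current instruction's address, matching RISC-V PC-relative semantics;
  // BranchRegister implements the register-indirect (JALR-style) jump to
  // base + offset.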
  void CompareAndBranch(Decoder::BranchOpcode opcode, Register arg1, Register arg2, int16_t offset);
  void Branch(int32_t offset);
  void BranchRegister(Register base, int16_t offset);

  [[nodiscard]] Register GetImm(uint64_t imm);
  [[nodiscard]] Register Copy(Register value) {
    Register result = AllocTempReg();
    Gen<PseudoCopy>(result, value, 8);
    return result;
  }

  [[nodiscard]] Register GetReg(uint8_t reg);
  void SetReg(uint8_t reg, Register value);

  void Undefined();
  //
  // Instruction implementations.
  //
  void Nop();
  Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2);
  Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2);
  Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm);
  Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm);
  Register Slli(Register arg, int8_t imm);
  Register Srli(Register arg, int8_t imm);
  Register Srai(Register arg, int8_t imm);
  Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm);
  Register Rori(Register arg, int8_t shamt);
  Register Roriw(Register arg, int8_t shamt);
  Register Lui(int32_t imm);
  Register Auipc(int32_t imm);

  Register Ecall(Register /* syscall_nr */,
                 Register /* arg0 */,
                 Register /* arg1 */,
                 Register /* arg2 */,
                 Register /* arg3 */,
                 Register /* arg4 */,
                 Register /* arg5 */) {
    Undefined();
    return {};
  }

  void Store(Decoder::MemoryDataOperandType operand_type,
             Register arg,
             int16_t offset,
             Register data);
  Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset);

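  // Maps a C++ integer type to the decoder's load operand type; for example,
  // ToLoadOperandType<int32_t>() yields Decoder::LoadOperandType::k32bitSigned.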
  template <typename IntType>
  constexpr Decoder::LoadOperandType ToLoadOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t>) {
      return Decoder::LoadOperandType::k8bitSigned;
    } else if constexpr (std::is_same_v<IntType, int16_t>) {
      return Decoder::LoadOperandType::k16bitSigned;
    } else if constexpr (std::is_same_v<IntType, int32_t>) {
      return Decoder::LoadOperandType::k32bitSigned;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::LoadOperandType::k64bit;
    } else if constexpr (std::is_same_v<IntType, uint8_t>) {
      return Decoder::LoadOperandType::k8bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint16_t>) {
      return Decoder::LoadOperandType::k16bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint32_t>) {
      return Decoder::LoadOperandType::k32bitUnsigned;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }

  template <typename IntType>
  constexpr Decoder::MemoryDataOperandType ToMemoryDataOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t> || std::is_same_v<IntType, uint8_t>) {
      return Decoder::MemoryDataOperandType::k8bit;
    } else if constexpr (std::is_same_v<IntType, int16_t> || std::is_same_v<IntType, uint16_t>) {
      return Decoder::MemoryDataOperandType::k16bit;
    } else if constexpr (std::is_same_v<IntType, int32_t> || std::is_same_v<IntType, uint32_t>) {
      return Decoder::MemoryDataOperandType::k32bit;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::MemoryDataOperandType::k64bit;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }

  // Versions without recovery can be used to access non-guest memory (e.g. CPUState).
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type, Register base, int32_t disp);
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type,
                               Register base,
                               Register index,
                               int32_t disp);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            int32_t disp,
                            Register val);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            Register index,
                            int32_t disp,
                            Register val);

  //
  // Atomic extensions.
  //

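  // Load-reserved. LR/SC pairs are emulated with a software memory region
  // reservation (see memory_region_reservation.h) rather than a hardware
  // monitor; for example, Lr<int32_t, true, false>(addr) implements lr.w.aq.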
  template <typename IntType, bool aq, bool rl>
  Register Lr(Register addr) {
    Register aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign-extended to 64 bits.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    MemoryRegionReservationLoad(aligned_addr);

    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());

    // Load the requested part from CPUState.
    return LoadWithoutRecovery(ToLoadOperandType<IntType>(),
                               x86_64::kMachineRegRBP,
                               addr_offset,
                               GetThreadStateReservationValueOffset());
  }

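  // Store-conditional. Succeeds only if the reservation taken by Lr is still
  // intact; the result register reports the outcome as RISC-V SC requires:
  // zero on success, non-zero on failure.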
  template <typename IntType, bool aq, bool rl>
  Register Sc(Register addr, Register data) {
    // Compute aligned_addr.
    auto aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign-extended to 64 bits.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    // Load the current monitor value before we clobber it.
    auto reservation_value = AllocTempReg();
    int32_t value_offset = GetThreadStateReservationValueOffset();
    Gen<x86_64::MovqRegMemBaseDisp>(reservation_value, x86_64::kMachineRegRBP, value_offset);
    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());
    // It's okay to clobber reservation_value since we clear out reservation_address in
    // MemoryRegionReservationExchange anyway.
    StoreWithoutRecovery(ToMemoryDataOperandType<IntType>(),
                         x86_64::kMachineRegRBP,
                         addr_offset,
                         value_offset,
                         data);

    return MemoryRegionReservationExchange(aligned_addr, reservation_value);
  }

  void Fence(Decoder::FenceOpcode opcode,
             Register src,
             bool sw,
             bool sr,
             bool so,
             bool si,
             bool pw,
             bool pr,
             bool po,
             bool pi);

  //
  // F and D extensions.
  //
  [[nodiscard]] FpRegister GetFpReg(uint8_t reg);

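  // RISC-V NaN-boxes narrower floats: a 32-bit value held in a 64-bit FP
  // register is valid only when its upper 32 bits are all ones, and reads must
  // otherwise treat it as the canonical NaN. The helpers below unbox on read
  // and re-box on write; the Float64 specializations further down are no-ops.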
  template <typename FloatType>
  [[nodiscard]] FpRegister GetFRegAndUnboxNan(uint8_t reg) {
    CHECK_LT(reg, kNumGuestFpRegs);
    FpRegister result = AllocTempSimdReg();
    builder_.GenGetSimd<8>(result.machine_reg(), GetThreadStateFRegOffset(reg));
    FpRegister unboxed_result = AllocTempSimdReg();
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroUnboxNanFloat32AVX>(unboxed_result.machine_reg(),
                                                    result.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroUnboxNanFloat32>(unboxed_result.machine_reg(),
                                                 result.machine_reg());
    }
    return unboxed_result;
  }

  template <typename FloatType>
  void NanBoxFpReg(FpRegister value) {
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroNanBoxFloat32AVX>(value.machine_reg(), value.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroNanBoxFloat32>(value.machine_reg());
    }
  }

  template <typename FloatType>
  void NanBoxAndSetFpReg(uint8_t reg, FpRegister value) {
    CHECK_LT(reg, kNumGuestFpRegs);
    if (success()) {
      NanBoxFpReg<FloatType>(value);
      builder_.GenSetSimd<8>(GetThreadStateFRegOffset(reg), value.machine_reg());
    }
  }

  template <typename DataType>
  FpRegister LoadFp(Register arg, int16_t offset) {
    auto res = AllocTempSimdReg();
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
    return res;
  }

  template <typename DataType>
  void StoreFp(Register arg, int16_t offset, FpRegister data) {
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
  }

  FpRegister Fmv(FpRegister arg) {
    auto res = AllocTempSimdReg();
    Gen<PseudoCopy>(res.machine_reg(), arg.machine_reg(), 16);
    return res;
  }

  //
  // V extension.
  //

  template <typename VOpArgs, typename... ExtraArgs>
  void OpVector(const VOpArgs& /*args*/, ExtraArgs... /*extra_args*/) {
    // TODO(b/300690740): develop and implement a strategy which would allow us to support vector
    // intrinsics not just in the interpreter.
    Undefined();
  }

  //
  // Csr
  //

  Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr);
  Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr);

  [[nodiscard]] bool success() const { return success_; }

  //
  // Intrinsic proxy methods.
  //

#include "berberis/intrinsics/translator_intrinsics_hooks-inl.h"

  //
  // Guest state getters/setters.
  //

  [[nodiscard]] GuestAddr GetInsnAddr() const { return pc_; }
  void IncrementInsnAddr(uint8_t insn_size) { pc_ += insn_size; }

  [[nodiscard]] bool IsRegionEndReached() const;
  void StartInsn();
  void Finalize(GuestAddr stop_pc);

  // These methods are exported only for testing.
  [[nodiscard]] const ArenaMap<GuestAddr, MachineInsnPosition>& branch_targets() const {
    return branch_targets_;
  }

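  // Generic CSR accessors. A CSR's value lives in a shadow field in CPUState
  // at kCsrFieldOffset<kName>; writes are masked with kCsrMask<kName>. CSRs
  // that need extra work (fcsr, fflags, frm, vxrm, vxsat, ...) are specialized
  // after the class definition.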
  template <CsrName kName>
  [[nodiscard]] Register GetCsr() {
    auto csr_reg = AllocTempReg();
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovzxblRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      Gen<x86_64::MovqRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
    return csr_reg;
  }

  template <CsrName kName>
  void SetCsr(uint8_t imm) {
    // Note: the CSR immediate only has 5 bits in the RISC-V encoding, which guarantees that
    // “imm & kCsrMask<kName>” can be used as an 8-bit immediate.
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int8_t>(imm & kCsrMask<kName>));
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      // A 64-bit field needs a full-width store; a byte store would leave the
      // upper bytes of the field with stale data.
      Gen<x86_64::MovqMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int32_t>(imm & kCsrMask<kName>));
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

  template <CsrName kName>
  void SetCsr(Register arg) {
    auto tmp = AllocTempReg();
    Gen<PseudoCopy>(tmp, arg, sizeof(CsrFieldType<kName>));
    if constexpr (sizeof(CsrFieldType<kName>) == 1) {
      Gen<x86_64::AndbRegImm>(tmp, kCsrMask<kName>, GetFlagsRegister());
      Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else if constexpr (sizeof(CsrFieldType<kName>) == 8) {
      Gen<x86_64::AndqRegMemAbsolute>(
          tmp, constants_pool::kConst<uint64_t{kCsrMask<kName>}>, GetFlagsRegister());
      Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

 private:
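  // Intrinsic dispatch: first try to inline the intrinsic's body straight into
  // the region IR (TryInlineIntrinsicForHeavyOptimizer*), and only fall back
  // to an out-of-line call through CallIntrinsicImpl when inlining fails.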
  // Specialization for AssemblerResType=void.
  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  void CallIntrinsic(AssemblerArgType... args) {
    if (TryInlineIntrinsicForHeavyOptimizerVoid<kFunction>(
            &builder_, GetFlagsRegister(), args...)) {
      return;
    }

    CallIntrinsicImpl(&builder_, kFunction, GetFlagsRegister(), args...);
  }

  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<!std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  AssemblerResType CallIntrinsic(AssemblerArgType... args) {
    AssemblerResType result;

    if constexpr (std::is_same_v<AssemblerResType, Register>) {
      result = AllocTempReg();
    } else if constexpr (std::is_same_v<AssemblerResType, SimdReg>) {
      result = AllocTempSimdReg();
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<Register, Register>>) {
      result = {AllocTempReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, Register>>) {
      result = {AllocTempSimdReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType,
                                        std::tuple<SimdReg, SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else {
      // This should not be reached by the compiler. If it is, there is a new result type that
      // needs to be supported.
      static_assert(kDependentTypeFalse<AssemblerResType>, "Unsupported result type");
    }

    if (TryInlineIntrinsicForHeavyOptimizer<kFunction>(
            &builder_, result, GetFlagsRegister(), args...)) {
      return result;
    }

    CallIntrinsicImpl(&builder_, kFunction, result, GetFlagsRegister(), args...);
    return result;
  }

  void MemoryRegionReservationLoad(Register aligned_addr);
  Register MemoryRegionReservationExchange(Register aligned_addr, Register curr_reservation_value);
  void MemoryRegionReservationSwapWithLockedOwner(Register aligned_addr,
                                                  Register curr_reservation_value,
                                                  Register new_reservation_value,
                                                  MachineBasicBlock* failure_bb);

  // Syntax sugar.
  template <typename InsnType, typename... Args>
  /*may_discard*/ InsnType* Gen(Args... args) {
    return builder_.Gen<InsnType, Args...>(args...);
  }

  static x86_64::Assembler::Condition ToAssemblerCond(Decoder::BranchOpcode opcode);

  [[nodiscard]] Register AllocTempReg();
  [[nodiscard]] SimdReg AllocTempSimdReg();
  [[nodiscard]] Register GetFlagsRegister() const { return flag_register_; }

  void GenJump(GuestAddr target);
  void ExitGeneratedCode(GuestAddr target);
  void ExitRegionIndirect(Register target);

  void GenRecoveryBlockForLastInsn();

  void ResolveJumps();
  void ReplaceJumpWithBranch(MachineBasicBlock* bb, MachineBasicBlock* target_bb);
  void UpdateBranchTargetsAfterSplit(GuestAddr addr,
                                     const MachineBasicBlock* old_bb,
                                     MachineBasicBlock* new_bb);

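  // Opens a fresh region: a dedicated entry block that does nothing but jump
  // to a continuation block where translation of guest instructions begins,
  // keeping the entry block itself free of guest instructions.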
  void StartRegion() {
    auto* region_entry_bb = builder_.ir()->NewBasicBlock();
    auto* cont_bb = builder_.ir()->NewBasicBlock();
    builder_.ir()->AddEdge(region_entry_bb, cont_bb);
    builder_.StartBasicBlock(region_entry_bb);
    Gen<PseudoBranch>(cont_bb);
    builder_.StartBasicBlock(cont_bb);
  }

  GuestAddr pc_;
  bool success_;
  x86_64::MachineIRBuilder builder_;
  MachineReg flag_register_;
  bool is_uncond_branch_;
  // Contains IR positions of all guest instructions of the current region.
  // Also contains all branch targets which the current region jumps to.
  // If a target is outside of the current region, its position is uninitialized,
  // i.e. its basic block (position.first) is nullptr.
  ArenaMap<GuestAddr, MachineInsnPosition> branch_targets_;
};

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::FpRegister
HeavyOptimizerFrontend::GetFRegAndUnboxNan<intrinsics::Float64>(uint8_t reg) {
  return GetFpReg(reg);
}

template <>
inline void HeavyOptimizerFrontend::NanBoxFpReg<intrinsics::Float64>(FpRegister) {}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kCycle>() {
  return CPUClockCount();
}

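// fcsr is a composite CSR: frm (the rounding mode) occupies bits 5-7 and
// fflags (the accrued exception flags) occupies bits 0-4, so reading it
// combines the stored frm field with the host FP exception state.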
template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFCsr>() {
  auto csr_reg = AllocTempReg();
  auto tmp = AllocTempReg();
  InlineIntrinsicForHeavyOptimizer<&intrinsics::FeGetExceptions>(
      &builder_, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegMemBaseDisp>(
      csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>);
  Gen<x86_64::ShlbRegImm>(csr_reg, 5, GetFlagsRegister());
  Gen<x86_64::OrbRegReg>(csr_reg, tmp, GetFlagsRegister());
  return csr_reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFFlags>() {
  return FeGetExceptions();
}

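// vlenb reports the vector register length in bytes; berberis implements
// VLEN = 128 bits, hence the constant 16.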
template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVlenb>() {
  return GetImm(16);
}

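// In the guest state's vcsr shadow field, vxrm (the fixed-point rounding mode)
// occupies bits 0-1 and vxsat (the saturation flag) occupies bit 2; the
// accessors below extract and update the corresponding slice.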
template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxrm>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::AndbRegImm>(reg, 0b11, GetFlagsRegister());
  return reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxsat>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::ShrbRegImm>(reg, 2, GetFlagsRegister());
  return reg;
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(uint8_t imm) {
  // Note: the instructions Csrrci and Csrrsi can't affect Frm because their immediate only has
  // five bits. But these instructions don't pass their immediate-specified argument into
  // `SetCsr`; they combine it with the register first. Fixing that can only be done by changing
  // code in the semantics player.
  //
  // Csrrwi, however, may clear it, and we actually may only arrive here from Csrrwi.
  // Thus, technically, we know that imm >> 5 is always zero, but it doesn't look like a good idea
  // to rely on that: it's very subtle and it only affects code generation speed.
  Gen<x86_64::MovbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, static_cast<int8_t>(imm >> 5));
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRoundImm>(
      &builder_, GetFlagsRegister(), imm);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(Register arg) {
  // Check the size to be sure we can use Andb and Movb below.
  static_assert(sizeof(kCsrMask<CsrName::kFrm>) == 1);

  auto exceptions = AllocTempReg();
  auto rounding_mode = AllocTempReg();
  Gen<PseudoCopy>(exceptions, arg, 1);
  Gen<x86_64::AndlRegImm>(exceptions, 0b1'1111, GetFlagsRegister());
  // We don't care about the data in rounding_mode because we will shift in the
  // data we need.
  Gen<PseudoDefReg>(rounding_mode);
  Gen<x86_64::ShldlRegRegImm>(rounding_mode, arg, int8_t{32 - 5}, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(rounding_mode, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, rounding_mode);
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRound>(
      &builder_, GetFlagsRegister(), exceptions, rounding_mode);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(uint8_t imm) {
  FeSetExceptionsImm(static_cast<int8_t>(imm & 0b1'1111));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(Register arg) {
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndlRegImm>(tmp, 0b1'1111, GetFlagsRegister());
  FeSetExceptions(tmp);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(uint8_t imm) {
  Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                  kCsrFieldOffset<CsrName::kFrm>,
                                  static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
  FeSetRoundImm(static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(Register arg) {
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndbRegImm>(tmp, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, tmp);
  FeSetRound(tmp);
}

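// Updates vxrm (bits 0-1 of vcsr) in place: first clear the old rounding mode
// while preserving vxsat (bit 2), then OR in the new one. Each step is skipped
// when the immediate makes it a no-op.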
template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(uint8_t imm) {
  imm &= 0b11;
  if (imm != 0b11) {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  }
  if (imm != 0b00) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, imm, GetFlagsRegister());
  }
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(Register arg) {
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(arg, 0b11, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, arg, GetFlagsRegister());
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(uint8_t imm) {
  if (imm & 0b1) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  } else {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
  }
}

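// Sets vxsat (bit 2 of vcsr) to the low bit of arg while leaving vxrm
// (bits 0-1) untouched.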
template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(Register arg) {
  using Condition = x86_64::Assembler::Condition;
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
  Gen<x86_64::TestbRegImm>(arg, 1, GetFlagsRegister());
  auto tmp = AllocTempReg();
  Gen<x86_64::SetccReg>(Condition::kNotZero, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegReg>(tmp, tmp);
  Gen<x86_64::ShlbRegImm>(tmp, int8_t{2}, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, tmp, GetFlagsRegister());
}

}  // namespace berberis

#endif /* BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_ */