1 /*
2  * Copyright (C) 2023 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "berberis/interpreter/riscv64/interpreter.h"
18 
19 #include <atomic>
20 #include <cfenv>
21 #include <cstdint>
22 #include <cstring>
23 
24 #include "berberis/base/bit_util.h"
25 #include "berberis/base/checks.h"
26 #include "berberis/base/macros.h"
27 #include "berberis/decoder/riscv64/decoder.h"
28 #include "berberis/decoder/riscv64/semantics_player.h"
29 #include "berberis/guest_state/guest_addr.h"
30 #include "berberis/guest_state/guest_state.h"
31 #include "berberis/intrinsics/guest_cpu_flags.h"  // ToHostRoundingMode
32 #include "berberis/intrinsics/intrinsics.h"
33 #include "berberis/intrinsics/intrinsics_float.h"
34 #include "berberis/intrinsics/riscv64/vector_intrinsics.h"
35 #include "berberis/intrinsics/simd_register.h"
36 #include "berberis/intrinsics/type_traits.h"
37 #include "berberis/kernel_api/run_guest_syscall.h"
38 #include "berberis/runtime_primitives/interpret_helpers.h"
39 #include "berberis/runtime_primitives/memory_region_reservation.h"
40 #include "berberis/runtime_primitives/recovery_code.h"
41 
42 #include "faulty_memory_accesses.h"
43 #include "regs.h"
44 
45 namespace berberis {
46 
47 inline constexpr std::memory_order AqRlToStdMemoryOrder(bool aq, bool rl) {
48   if (aq) {
49     if (rl) {
50       return std::memory_order_acq_rel;
51     } else {
52       return std::memory_order_acquire;
53     }
54   } else {
55     if (rl) {
56       return std::memory_order_release;
57     } else {
58       return std::memory_order_relaxed;
59     }
60   }
61 }
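// Illustrative compile-time sketch of the resulting mapping (assuming the usual RISC-V aq/rl
// encodings, e.g. "lr.d.aq" has aq=1, rl=0 and "amoadd.d.aqrl" has aq=1, rl=1):
static_assert(AqRlToStdMemoryOrder(true, false) == std::memory_order_acquire);
static_assert(AqRlToStdMemoryOrder(false, true) == std::memory_order_release);
static_assert(AqRlToStdMemoryOrder(true, true) == std::memory_order_acq_rel);
static_assert(AqRlToStdMemoryOrder(false, false) == std::memory_order_relaxed);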
62 
63 template <typename ConcreteType, template <auto> typename TemplateType>
64 inline constexpr bool IsTypeTemplateOf = false;
65 
66 template <template <auto> typename TemplateType, auto Value>
67 inline constexpr bool IsTypeTemplateOf<TemplateType<Value>, TemplateType> = true;
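// Usage sketch (illustrative): for a template with a single `auto` parameter, such as `Vec`
// defined further below,
//   IsTypeTemplateOf<Vec<intrinsics::NoInactiveProcessing{}>, Vec> is true, while
//   IsTypeTemplateOf<SIMD128Register, Vec> is false.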
68 
69 class Interpreter {
70  public:
71   using CsrName = berberis::CsrName;
72   using Decoder = Decoder<SemanticsPlayer<Interpreter>>;
73   using Register = uint64_t;
74   using FpRegister = uint64_t;
75   using Float32 = intrinsics::Float32;
76   using Float64 = intrinsics::Float64;
77 
78   explicit Interpreter(ThreadState* state)
79       : state_(state), branch_taken_(false), exception_raised_(false) {}
80 
81   //
82   // Instruction implementations.
83   //
84 
85   Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr) {
86     switch (opcode) {
87       case Decoder::CsrOpcode::kCsrrs:
88         return arg | csr;
89       case Decoder::CsrOpcode::kCsrrc:
90         return ~arg & csr;
91       default:
92         Undefined();
93         return {};
94     }
95   }
96 
97   Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr) {
98     return UpdateCsr(static_cast<Decoder::CsrOpcode>(opcode), imm, csr);
99   }
100 
101   // Note: we prefer not to use C11/C++ atomic_thread_fence or even the gcc/clang builtin
102   // __atomic_thread_fence because all of these functions rely on the fact that the compiler never
103   // uses non-temporal loads and stores and thus only issues “mfence” when sequentially consistent
104   // ordering is requested. They never issue “lfence” or “sfence”.
105   // Instead we take a page from the Linux kernel's book and map read ordering to “lfence”, write
106   // ordering to “sfence”, and read-write ordering to “mfence”.
107   // This may become important in the future if we start using non-temporal moves in manually
108   // written assembly code.
109   // Ordering that affects I/O devices is not relevant to user-space code, thus we simply ignore
110   // the bits related to device I/O.
111   void Fence(Decoder::FenceOpcode /*opcode*/,
112              Register /*src*/,
113              bool sw,
114              bool sr,
115              bool /*so*/,
116              bool /*si*/,
117              bool pw,
118              bool pr,
119              bool /*po*/,
120              bool /*pi*/) {
121     bool read_fence = sr | pr;
122     bool write_fence = sw | pw;
123     // Two types of fences (total store ordering fence and normal fence) are supposed to be
124     // processed differently, but only in the “read_fence && write_fence” case (otherwise a total
125     // store ordering fence becomes a normal fence for “forward compatibility”). Yet because x86
126     // doesn't distinguish between these two types of fences, and since we are supposed to map all
127     // not-yet-defined fences to the normal fence (again, for “forward compatibility”), it's OK to
128     // simply ignore the opcode field.
129     if (read_fence) {
130       if (write_fence) {
131         asm volatile("mfence" ::: "memory");
132       } else {
133         asm volatile("lfence" ::: "memory");
134       }
135     } else if (write_fence) {
136       asm volatile("sfence" ::: "memory");
137     }
138     return;
139   }
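  // For example (a sketch derived from the mapping above): a guest "fence rw,rw"
  // (pr=pw=sr=sw=1) is lowered to "mfence", "fence r,r" to "lfence", and "fence w,w" to
  // "sfence", while the device-ordering bits (pi/po/si/so) are simply ignored.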
140 
141   template <typename IntType, bool aq, bool rl>
142   Register Lr(int64_t addr) {
143     static_assert(std::is_integral_v<IntType>, "Lr: IntType must be integral");
144     static_assert(std::is_signed_v<IntType>, "Lr: IntType must be signed");
145     CHECK(!exception_raised_);
146     // Address must be aligned to the size of IntType.
147     CHECK((addr % sizeof(IntType)) == 0ULL);
148     return MemoryRegionReservation::Load<IntType>(&state_->cpu, addr, AqRlToStdMemoryOrder(aq, rl));
149   }
150 
151   template <typename IntType, bool aq, bool rl>
152   Register Sc(int64_t addr, IntType val) {
153     static_assert(std::is_integral_v<IntType>, "Sc: IntType must be integral");
154     static_assert(std::is_signed_v<IntType>, "Sc: IntType must be signed");
155     CHECK(!exception_raised_);
156     // Address must be aligned to the size of IntType.
157     CHECK((addr % sizeof(IntType)) == 0ULL);
158     return static_cast<Register>(MemoryRegionReservation::Store<IntType>(
159         &state_->cpu, addr, val, AqRlToStdMemoryOrder(aq, rl)));
160   }
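  // Lr and Sc together implement the guest's load-reserved/store-conditional pair. A sketch of a
  // typical guest retry loop (assuming the standard RISC-V convention that sc yields 0 on success
  // and non-zero on failure, which MemoryRegionReservation::Store is expected to follow):
  //   retry: lr.w.aq  t0, (a0)
  //          add      t0, t0, a1
  //          sc.w.rl  t1, t0, (a0)
  //          bnez     t1, retry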
161 
162   Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2) {
163     switch (opcode) {
164       case Decoder::OpOpcode::kAdd:
165         return Int64(arg1) + Int64(arg2);
166       case Decoder::OpOpcode::kSub:
167         return Int64(arg1) - Int64(arg2);
168       case Decoder::OpOpcode::kAnd:
169         return Int64(arg1) & Int64(arg2);
170       case Decoder::OpOpcode::kOr:
171         return Int64(arg1) | Int64(arg2);
172       case Decoder::OpOpcode::kXor:
173         return Int64(arg1) ^ Int64(arg2);
174       case Decoder::OpOpcode::kSll:
175         return Int64(arg1) << Int64(arg2);
176       case Decoder::OpOpcode::kSrl:
177         return UInt64(arg1) >> Int64(arg2);
178       case Decoder::OpOpcode::kSra:
179         return Int64(arg1) >> Int64(arg2);
180       case Decoder::OpOpcode::kSlt:
181         return Int64(arg1) < Int64(arg2) ? 1 : 0;
182       case Decoder::OpOpcode::kSltu:
183         return UInt64(arg1) < UInt64(arg2) ? 1 : 0;
184       case Decoder::OpOpcode::kMul:
185         return Int64(arg1) * Int64(arg2);
186       case Decoder::OpOpcode::kMulh:
187         return NarrowTopHalf(Widen(Int64(arg1)) * Widen(Int64(arg2)));
188       case Decoder::OpOpcode::kMulhsu:
189         return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2))));
190       case Decoder::OpOpcode::kMulhu:
191         return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2)));
192       case Decoder::OpOpcode::kAndn:
193         return Int64(arg1) & (~Int64(arg2));
194       case Decoder::OpOpcode::kOrn:
195         return Int64(arg1) | (~Int64(arg2));
196       case Decoder::OpOpcode::kXnor:
197         return ~(Int64(arg1) ^ Int64(arg2));
198       default:
199         Undefined();
200         return {};
201     }
202   }
203 
204   Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2) {
205     switch (opcode) {
206       case Decoder::Op32Opcode::kAddw:
207         return Widen(TruncateTo<Int32>(arg1) + TruncateTo<Int32>(arg2));
208       case Decoder::Op32Opcode::kSubw:
209         return Widen(TruncateTo<Int32>(arg1) - TruncateTo<Int32>(arg2));
210       case Decoder::Op32Opcode::kSllw:
211         return Widen(TruncateTo<Int32>(arg1) << TruncateTo<Int32>(arg2));
212       case Decoder::Op32Opcode::kSrlw:
213         return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) >> TruncateTo<Int32>(arg2)));
214       case Decoder::Op32Opcode::kSraw:
215         return Widen(TruncateTo<Int32>(arg1) >> TruncateTo<Int32>(arg2));
216       case Decoder::Op32Opcode::kMulw:
217         return Widen(TruncateTo<Int32>(arg1) * TruncateTo<Int32>(arg2));
218       default:
219         Undefined();
220         return {};
221     }
222   }
223 
224   Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset) {
225     void* ptr = ToHostAddr<void>(arg + offset);
226     switch (operand_type) {
227       case Decoder::LoadOperandType::k8bitUnsigned:
228         return Load<uint8_t>(ptr);
229       case Decoder::LoadOperandType::k16bitUnsigned:
230         return Load<uint16_t>(ptr);
231       case Decoder::LoadOperandType::k32bitUnsigned:
232         return Load<uint32_t>(ptr);
233       case Decoder::LoadOperandType::k64bit:
234         return Load<uint64_t>(ptr);
235       case Decoder::LoadOperandType::k8bitSigned:
236         return Load<int8_t>(ptr);
237       case Decoder::LoadOperandType::k16bitSigned:
238         return Load<int16_t>(ptr);
239       case Decoder::LoadOperandType::k32bitSigned:
240         return Load<int32_t>(ptr);
241       default:
242         Undefined();
243         return {};
244     }
245   }
246 
247   template <typename DataType>
248   FpRegister LoadFp(Register arg, int16_t offset) {
249     static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
250     CHECK(!exception_raised_);
251     DataType* ptr = ToHostAddr<DataType>(arg + offset);
252     FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
253     if (result.is_fault) {
254       exception_raised_ = true;
255       return {};
256     }
257     return result.value;
258   }
259 
260   Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm) {
261     switch (opcode) {
262       case Decoder::OpImmOpcode::kAddi:
263         return arg + int64_t{imm};
264       case Decoder::OpImmOpcode::kSlti:
265         return bit_cast<int64_t>(arg) < int64_t{imm} ? 1 : 0;
266       case Decoder::OpImmOpcode::kSltiu:
267         return arg < bit_cast<uint64_t>(int64_t{imm}) ? 1 : 0;
268       case Decoder::OpImmOpcode::kXori:
269         return arg ^ int64_t{imm};
270       case Decoder::OpImmOpcode::kOri:
271         return arg | int64_t{imm};
272       case Decoder::OpImmOpcode::kAndi:
273         return arg & int64_t{imm};
274       default:
275         Undefined();
276         return {};
277     }
278   }
279 
280   Register Lui(int32_t imm) { return int64_t{imm}; }
281 
282   Register Auipc(int32_t imm) {
283     uint64_t pc = state_->cpu.insn_addr;
284     return pc + int64_t{imm};
285   }
286 
287   Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm) {
288     switch (opcode) {
289       case Decoder::OpImm32Opcode::kAddiw:
290         return int32_t(arg) + int32_t{imm};
291       default:
292         Undefined();
293         return {};
294     }
295   }
296 
297   // TODO(b/232598137): rework ecall to not take parameters explicitly.
298   Register Ecall(Register /* syscall_nr */,
299                  Register /* arg0 */,
300                  Register /* arg1 */,
301                  Register /* arg2 */,
302                  Register /* arg3 */,
303                  Register /* arg4 */,
304                  Register /* arg5 */) {
305     CHECK(!exception_raised_);
306     RunGuestSyscall(state_);
307     return state_->cpu.x[A0];
308   }
309 
310   Register Slli(Register arg, int8_t imm) { return arg << imm; }
311 
312   Register Srli(Register arg, int8_t imm) { return arg >> imm; }
313 
314   Register Srai(Register arg, int8_t imm) { return bit_cast<int64_t>(arg) >> imm; }
315 
316   Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm) {
317     switch (opcode) {
318       case Decoder::ShiftImm32Opcode::kSlliw:
319         return int32_t(arg) << int32_t{imm};
320       case Decoder::ShiftImm32Opcode::kSrliw:
321         return bit_cast<int32_t>(uint32_t(arg) >> uint32_t{imm});
322       case Decoder::ShiftImm32Opcode::kSraiw:
323         return int32_t(arg) >> int32_t{imm};
324       default:
325         Undefined();
326         return {};
327     }
328   }
329 
330   Register Rori(Register arg, int8_t shamt) {
331     CheckShamtIsValid(shamt);
332     return (((uint64_t(arg) >> shamt)) | (uint64_t(arg) << (64 - shamt)));
333   }
334 
335   Register Roriw(Register arg, int8_t shamt) {
336     CheckShamt32IsValid(shamt);
337     return int32_t(((uint32_t(arg) >> shamt)) | (uint32_t(arg) << (32 - shamt)));
338   }
339 
340   void Store(Decoder::MemoryDataOperandType operand_type,
341              Register arg,
342              int16_t offset,
343              Register data) {
344     void* ptr = ToHostAddr<void>(arg + offset);
345     switch (operand_type) {
346       case Decoder::MemoryDataOperandType::k8bit:
347         Store<uint8_t>(ptr, data);
348         break;
349       case Decoder::MemoryDataOperandType::k16bit:
350         Store<uint16_t>(ptr, data);
351         break;
352       case Decoder::MemoryDataOperandType::k32bit:
353         Store<uint32_t>(ptr, data);
354         break;
355       case Decoder::MemoryDataOperandType::k64bit:
356         Store<uint64_t>(ptr, data);
357         break;
358       default:
359         return Undefined();
360     }
361   }
362 
363   template <typename DataType>
364   void StoreFp(Register arg, int16_t offset, FpRegister data) {
365     static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
366     CHECK(!exception_raised_);
367     DataType* ptr = ToHostAddr<DataType>(arg + offset);
368     exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
369   }
370 
371   void CompareAndBranch(Decoder::BranchOpcode opcode,
372                         Register arg1,
373                         Register arg2,
374                         int16_t offset) {
375     bool cond_value;
376     switch (opcode) {
377       case Decoder::BranchOpcode::kBeq:
378         cond_value = arg1 == arg2;
379         break;
380       case Decoder::BranchOpcode::kBne:
381         cond_value = arg1 != arg2;
382         break;
383       case Decoder::BranchOpcode::kBltu:
384         cond_value = arg1 < arg2;
385         break;
386       case Decoder::BranchOpcode::kBgeu:
387         cond_value = arg1 >= arg2;
388         break;
389       case Decoder::BranchOpcode::kBlt:
390         cond_value = bit_cast<int64_t>(arg1) < bit_cast<int64_t>(arg2);
391         break;
392       case Decoder::BranchOpcode::kBge:
393         cond_value = bit_cast<int64_t>(arg1) >= bit_cast<int64_t>(arg2);
394         break;
395       default:
396         return Undefined();
397     }
398 
399     if (cond_value) {
400       Branch(offset);
401     }
402   }
403 
404   void Branch(int32_t offset) {
405     CHECK(!exception_raised_);
406     state_->cpu.insn_addr += offset;
407     branch_taken_ = true;
408   }
409 
410   void BranchRegister(Register base, int16_t offset) {
411     CHECK(!exception_raised_);
412     state_->cpu.insn_addr = (base + offset) & ~uint64_t{1};
413     branch_taken_ = true;
414   }
415 
416   FpRegister Fmv(FpRegister arg) { return arg; }
417 
418   //
419   // V extensions.
420   //
421 
422   using TailProcessing = intrinsics::TailProcessing;
423   using InactiveProcessing = intrinsics::InactiveProcessing;
424 
425   enum class VectorSelectElementWidth {
426     k8bit = 0b000,
427     k16bit = 0b001,
428     k32bit = 0b010,
429     k64bit = 0b011,
430     kMaxValue = 0b111,
431   };
432 
433   enum class VectorRegisterGroupMultiplier {
434     k1register = 0b000,
435     k2registers = 0b001,
436     k4registers = 0b010,
437     k8registers = 0b011,
438     kEigthOfRegister = 0b101,
439     kQuarterOfRegister = 0b110,
440     kHalfOfRegister = 0b111,
441     kMaxValue = 0b111,
442   };
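  // These enumerators mirror the vlmul field encoding of the vtype CSR: e.g. 0b000 selects LMUL=1
  // (one register per group), 0b011 selects LMUL=8, and 0b111 selects LMUL=1/2, i.e. only the low
  // half of a single vector register participates.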
443 
444   static constexpr size_t NumberOfRegistersInvolved(VectorRegisterGroupMultiplier vlmul) {
445     switch (vlmul) {
446       case VectorRegisterGroupMultiplier::k2registers:
447         return 2;
448       case VectorRegisterGroupMultiplier::k4registers:
449         return 4;
450       case VectorRegisterGroupMultiplier::k8registers:
451         return 8;
452       default:
453         return 1;
454     }
455   }
456 
457   static constexpr size_t NumRegistersInvolvedForWideOperand(VectorRegisterGroupMultiplier vlmul) {
458     switch (vlmul) {
459       case VectorRegisterGroupMultiplier::k1register:
460         return 2;
461       case VectorRegisterGroupMultiplier::k2registers:
462         return 4;
463       case VectorRegisterGroupMultiplier::k4registers:
464         return 8;
465       default:
466         return 1;
467     }
468   }
469 
470   template <typename ElementType, VectorRegisterGroupMultiplier vlmul>
471   static constexpr size_t GetVlmax() {
472     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
473     switch (vlmul) {
474       case VectorRegisterGroupMultiplier::k1register:
475         return kElementsCount;
476       case VectorRegisterGroupMultiplier::k2registers:
477         return 2 * kElementsCount;
478       case VectorRegisterGroupMultiplier::k4registers:
479         return 4 * kElementsCount;
480       case VectorRegisterGroupMultiplier::k8registers:
481         return 8 * kElementsCount;
482       case VectorRegisterGroupMultiplier::kEigthOfRegister:
483         return kElementsCount / 8;
484       case VectorRegisterGroupMultiplier::kQuarterOfRegister:
485         return kElementsCount / 4;
486       case VectorRegisterGroupMultiplier::kHalfOfRegister:
487         return kElementsCount / 2;
488       default:
489         return 0;
490     }
491   }
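  // Example: with ElementType = UInt32 (four 4-byte elements per 128-bit register) and
  // vlmul = k2registers, GetVlmax() yields 2 * 4 = 8 elements; with vlmul = kQuarterOfRegister it
  // yields 4 / 4 = 1.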
492 
493   template <typename VOpArgs, typename... ExtraArgs>
494   void OpVector(const VOpArgs& args, ExtraArgs... extra_args) {
495     // Note: whole register instructions are not dependent on vtype and are supposed to work even
496     // if vill is set!  Handle them before processing other instructions.
497     // Note: other types of loads and stores are not special and are processed as usual.
498     // TODO(khim): Handle vstart properly.
499     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs>) {
500       if (args.opcode == Decoder::VLUmOpOpcode::kVlXreXX) {
501         if (!IsPowerOf2(args.nf + 1)) {
502           return Undefined();
503         }
504         if ((args.dst & args.nf) != 0) {
505           return Undefined();
506         }
507         auto [src] = std::tuple{extra_args...};
508         __uint128_t* ptr = bit_cast<__uint128_t*>(src);
509         for (size_t index = 0; index <= args.nf; index++) {
510           state_->cpu.v[args.dst + index] = ptr[index];
511         }
512         return;
513       }
514     }
515 
516     if constexpr (std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
517       if (args.opcode == Decoder::VSUmOpOpcode::kVsX) {
518         if (args.width != Decoder::MemoryDataOperandType::k8bit) {
519           return Undefined();
520         }
521         if (!IsPowerOf2(args.nf + 1)) {
522           return Undefined();
523         }
524         if ((args.data & args.nf) != 0) {
525           return Undefined();
526         }
527         auto [src] = std::tuple{extra_args...};
528         __uint128_t* ptr = bit_cast<__uint128_t*>(src);
529         for (size_t index = 0; index <= args.nf; index++) {
530           ptr[index] = state_->cpu.v[args.data + index];
531         }
532         return;
533       }
534     }
535 
536     // The RISC-V V extension uses an 8-bit “opcode extension” vtype CSR to make sure the 32-bit
537     // encoding remains usable.
538     //
539     // Great care is taken to ensure that vector code doesn't need to change the vtype CSR often
540     // (e.g. there are special mask instructions which allow one to manipulate masks without the
541     // need to change the CPU mode).
542     //
543     // Currently we don't have support for multiple CPU modes in Berberis, thus we can only handle
544     // these instructions in the interpreter.
545     //
546     // TODO(b/300690740): develop and implement strategy which would allow us to support vector
547     // intrinsics not just in the interpreter. Move code from this function to semantics player.
548     Register vtype = GetCsr<CsrName::kVtype>();
549     if (static_cast<std::make_signed_t<Register>>(vtype) < 0) {
550       return Undefined();
551     }
552     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
553                   std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
554                   std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
555                   std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
556                   std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
557                   std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
558       switch (args.width) {
559         case Decoder::MemoryDataOperandType::k8bit:
560           return OpVector<UInt8>(args, vtype, extra_args...);
561         case Decoder::MemoryDataOperandType::k16bit:
562           return OpVector<UInt16>(args, vtype, extra_args...);
563         case Decoder::MemoryDataOperandType::k32bit:
564           return OpVector<UInt32>(args, vtype, extra_args...);
565         case Decoder::MemoryDataOperandType::k64bit:
566           return OpVector<UInt64>(args, vtype, extra_args...);
567         default:
568           return Undefined();
569       }
570     } else {
571       VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0x7);
572       if constexpr (std::is_same_v<VOpArgs, Decoder::VOpFVfArgs> ||
573                     std::is_same_v<VOpArgs, Decoder::VOpFVvArgs>) {
574         switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
575           case VectorSelectElementWidth::k16bit:
576             if constexpr (sizeof...(extra_args) == 0) {
577               return OpVector<intrinsics::Float16>(args, vlmul, vtype);
578             } else {
579               return Undefined();
580             }
581           case VectorSelectElementWidth::k32bit:
582             return OpVector<Float32>(
583                 args,
584                 vlmul,
585                 vtype,
586                 std::get<0>(intrinsics::UnboxNan<Float32>(bit_cast<Float64>(extra_args)))...);
587           case VectorSelectElementWidth::k64bit:
588             // Note: if arguments are 64bit floats then we don't need to do any unboxing.
589             return OpVector<Float64>(args, vlmul, vtype, bit_cast<Float64>(extra_args)...);
590           default:
591             return Undefined();
592         }
593       } else {
594         switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
595           case VectorSelectElementWidth::k8bit:
596             return OpVector<UInt8>(args, vlmul, vtype, extra_args...);
597           case VectorSelectElementWidth::k16bit:
598             return OpVector<UInt16>(args, vlmul, vtype, extra_args...);
599           case VectorSelectElementWidth::k32bit:
600             return OpVector<UInt32>(args, vlmul, vtype, extra_args...);
601           case VectorSelectElementWidth::k64bit:
602             return OpVector<UInt64>(args, vlmul, vtype, extra_args...);
603           default:
604             return Undefined();
605         }
606       }
607     }
608   }
609 
610   template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
611   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
612     auto vemul = Decoder::SignExtend<3>(vtype & 0b111);
613     vemul -= ((vtype >> 3) & 0b111);        // Divide by SEW.
614     vemul +=
615         static_cast<std::underlying_type_t<decltype(args.width)>>(args.width);  // Multiply by EEW.
616     if (vemul < -3 || vemul > 3) [[unlikely]] {
617       return Undefined();
618     }
619     // Note: whole register loads and stores treat args.nf differently, but they are processed
620     // separately above anyway, because they also ignore vtype and all the information in it!
621     // For other loads and stores the affected number of registers (EMUL * NF) must be 8 or less.
622     if ((vemul > 0) && ((args.nf + 1) * (1 << vemul) > 8)) {
623       return Undefined();
624     }
625     return OpVector<ElementType>(
626         args, static_cast<VectorRegisterGroupMultiplier>(vemul & 0b111), vtype, extra_args...);
627   }
628 
629   template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
630   void OpVector(const VOpArgs& args,
631                 VectorRegisterGroupMultiplier vlmul,
632                 Register vtype,
633                 ExtraArgs... extra_args) {
634     switch (vlmul) {
635       case VectorRegisterGroupMultiplier::k1register:
636         return OpVector<ElementType, VectorRegisterGroupMultiplier::k1register>(
637             args, vtype, extra_args...);
638       case VectorRegisterGroupMultiplier::k2registers:
639         return OpVector<ElementType, VectorRegisterGroupMultiplier::k2registers>(
640             args, vtype, extra_args...);
641       case VectorRegisterGroupMultiplier::k4registers:
642         return OpVector<ElementType, VectorRegisterGroupMultiplier::k4registers>(
643             args, vtype, extra_args...);
644       case VectorRegisterGroupMultiplier::k8registers:
645         return OpVector<ElementType, VectorRegisterGroupMultiplier::k8registers>(
646             args, vtype, extra_args...);
647       case VectorRegisterGroupMultiplier::kEigthOfRegister:
648         return OpVector<ElementType, VectorRegisterGroupMultiplier::kEigthOfRegister>(
649             args, vtype, extra_args...);
650       case VectorRegisterGroupMultiplier::kQuarterOfRegister:
651         return OpVector<ElementType, VectorRegisterGroupMultiplier::kQuarterOfRegister>(
652             args, vtype, extra_args...);
653       case VectorRegisterGroupMultiplier::kHalfOfRegister:
654         return OpVector<ElementType, VectorRegisterGroupMultiplier::kHalfOfRegister>(
655             args, vtype, extra_args...);
656       default:
657         return Undefined();
658     }
659   }
660 
661   template <typename ElementType,
662             VectorRegisterGroupMultiplier vlmul,
663             typename VOpArgs,
664             typename... ExtraArgs>
665   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
666     if (args.vm) {
667       return OpVector<ElementType, vlmul, intrinsics::NoInactiveProcessing{}>(
668           args, vtype, extra_args...);
669     }
670     if (vtype >> 7) {
671       return OpVector<ElementType, vlmul, InactiveProcessing::kAgnostic>(
672           args, vtype, extra_args...);
673     }
674     return OpVector<ElementType, vlmul, InactiveProcessing::kUndisturbed>(
675         args, vtype, extra_args...);
676   }
677 
678   template <typename ElementType,
679             VectorRegisterGroupMultiplier vlmul,
680             auto vma,
681             typename VOpArgs,
682             typename... ExtraArgs>
683   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
684     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
685                   std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
686                   std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
687                   std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
688                   std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
689                   std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
690       constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
691       // Note: whole register loads and stores treat args.nf differently, but they are processed
692       // separately above anyway, because they also ignore vtype and all the information in it!
693       switch (args.nf) {
694         case 0:
695           return OpVector<ElementType, 1, vlmul, vma>(args, vtype, extra_args...);
696         case 1:
697           if constexpr (kRegistersInvolved > 4) {
698             return Undefined();
699           } else {
700             return OpVector<ElementType, 2, vlmul, vma>(args, vtype, extra_args...);
701           }
702         case 2:
703           if constexpr (kRegistersInvolved > 2) {
704             return Undefined();
705           } else {
706             return OpVector<ElementType, 3, vlmul, vma>(args, vtype, extra_args...);
707           }
708         case 3:
709           if constexpr (kRegistersInvolved > 2) {
710             return Undefined();
711           } else {
712             return OpVector<ElementType, 4, vlmul, vma>(args, vtype, extra_args...);
713           }
714         case 4:
715           if constexpr (kRegistersInvolved > 1) {
716             return Undefined();
717           } else {
718             return OpVector<ElementType, 5, vlmul, vma>(args, vtype, extra_args...);
719           }
720         case 5:
721           if constexpr (kRegistersInvolved > 1) {
722             return Undefined();
723           } else {
724             return OpVector<ElementType, 6, vlmul, vma>(args, vtype, extra_args...);
725           }
726         case 6:
727           if constexpr (kRegistersInvolved > 1) {
728             return Undefined();
729           } else {
730             return OpVector<ElementType, 7, vlmul, vma>(args, vtype, extra_args...);
731           }
732         case 7:
733           if constexpr (kRegistersInvolved > 1) {
734             return Undefined();
735           } else {
736             return OpVector<ElementType, 8, vlmul, vma>(args, vtype, extra_args...);
737           }
738       }
739     } else {
740       if ((vtype >> 6) & 1) {
741         return OpVector<ElementType, vlmul, TailProcessing::kAgnostic, vma>(args, extra_args...);
742       }
743       return OpVector<ElementType, vlmul, TailProcessing::kUndisturbed, vma>(args, extra_args...);
744     }
745   }
746 
747   template <typename ElementType,
748             size_t kSegmentSize,
749             VectorRegisterGroupMultiplier vlmul,
750             auto vma,
751             typename VOpArgs,
752             typename... ExtraArgs>
753   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
754     // Indexed loads and stores have two operands with different ElementTypes and lmul sizes, so
755     // pass vtype down to do further selection.
756     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
757                   std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs>) {
758       // Because we know that we are dealing with indexed loads and stores, and wouldn't need to
759       // convert elmul to anything else, we can immediately turn it into kIndexRegistersInvolved
760       // here.
761       if ((vtype >> 6) & 1) {
762         return OpVector<kSegmentSize,
763                         ElementType,
764                         NumberOfRegistersInvolved(vlmul),
765                         TailProcessing::kAgnostic,
766                         vma>(args, vtype, extra_args...);
767       }
768       return OpVector<kSegmentSize,
769                       ElementType,
770                       NumberOfRegistersInvolved(vlmul),
771                       TailProcessing::kUndisturbed,
772                       vma>(args, vtype, extra_args...);
773     } else {
774       // For other instructions we have parsed all the information from vtype and only need to pass
775       // args and extra_args.
776       if ((vtype >> 6) & 1) {
777         return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kAgnostic, vma>(
778             args, extra_args...);
779       }
780       return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kUndisturbed, vma>(
781           args, extra_args...);
782     }
783   }
784 
785   template <size_t kSegmentSize,
786             typename IndexElementType,
787             size_t kIndexRegistersInvolved,
788             TailProcessing vta,
789             auto vma,
790             typename VOpArgs,
791             typename... ExtraArgs>
792   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
793     VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0b111);
794     switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
795       case VectorSelectElementWidth::k8bit:
796         return OpVector<UInt8, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
797             args, vlmul, extra_args...);
798       case VectorSelectElementWidth::k16bit:
799         return OpVector<UInt16, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
800             args, vlmul, extra_args...);
801       case VectorSelectElementWidth::k32bit:
802         return OpVector<UInt32, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
803             args, vlmul, extra_args...);
804       case VectorSelectElementWidth::k64bit:
805         return OpVector<UInt64, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
806             args, vlmul, extra_args...);
807       default:
808         return Undefined();
809     }
810   }
811 
812   template <typename DataElementType,
813             size_t kSegmentSize,
814             typename IndexElementType,
815             size_t kIndexRegistersInvolved,
816             TailProcessing vta,
817             auto vma,
818             typename VOpArgs,
819             typename... ExtraArgs>
820   void OpVector(const VOpArgs& args, VectorRegisterGroupMultiplier vlmul, ExtraArgs... extra_args) {
821     switch (vlmul) {
822       case VectorRegisterGroupMultiplier::k1register:
823         return OpVector<DataElementType,
824                         VectorRegisterGroupMultiplier::k1register,
825                         IndexElementType,
826                         kSegmentSize,
827                         kIndexRegistersInvolved,
828                         vta,
829                         vma>(args, extra_args...);
830       case VectorRegisterGroupMultiplier::k2registers:
831         return OpVector<DataElementType,
832                         VectorRegisterGroupMultiplier::k2registers,
833                         IndexElementType,
834                         kSegmentSize,
835                         kIndexRegistersInvolved,
836                         vta,
837                         vma>(args, extra_args...);
838       case VectorRegisterGroupMultiplier::k4registers:
839         return OpVector<DataElementType,
840                         VectorRegisterGroupMultiplier::k4registers,
841                         IndexElementType,
842                         kSegmentSize,
843                         kIndexRegistersInvolved,
844                         vta,
845                         vma>(args, extra_args...);
846       case VectorRegisterGroupMultiplier::k8registers:
847         return OpVector<DataElementType,
848                         VectorRegisterGroupMultiplier::k8registers,
849                         IndexElementType,
850                         kSegmentSize,
851                         kIndexRegistersInvolved,
852                         vta,
853                         vma>(args, extra_args...);
854       case VectorRegisterGroupMultiplier::kEigthOfRegister:
855         return OpVector<DataElementType,
856                         VectorRegisterGroupMultiplier::kEigthOfRegister,
857                         IndexElementType,
858                         kSegmentSize,
859                         kIndexRegistersInvolved,
860                         vta,
861                         vma>(args, extra_args...);
862       case VectorRegisterGroupMultiplier::kQuarterOfRegister:
863         return OpVector<DataElementType,
864                         VectorRegisterGroupMultiplier::kQuarterOfRegister,
865                         IndexElementType,
866                         kSegmentSize,
867                         kIndexRegistersInvolved,
868                         vta,
869                         vma>(args, extra_args...);
870       case VectorRegisterGroupMultiplier::kHalfOfRegister:
871         return OpVector<DataElementType,
872                         VectorRegisterGroupMultiplier::kHalfOfRegister,
873                         IndexElementType,
874                         kSegmentSize,
875                         kIndexRegistersInvolved,
876                         vta,
877                         vma>(args, extra_args...);
878       default:
879         return Undefined();
880     }
881   }
882 
883   // CSR registers that are permitted as an argument of a strip-mining intrinsic.
884   using CsrName::kFrm;
885   using CsrName::kVxrm;
886   using CsrName::kVxsat;
887   // The argument of an OpVectorXXX function is the number of the vector register group.
888   template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
889   struct Vec {
890     uint8_t start_no;
891   };
892   // Vector argument 2x wide (for narrowing and widening instructions).
893   template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
894   struct WideVec {
895     uint8_t start_no;
896   };
897 
898   template <typename DataElementType,
899             VectorRegisterGroupMultiplier vlmul,
900             typename IndexElementType,
901             size_t kSegmentSize,
902             size_t kIndexRegistersInvolved,
903             TailProcessing vta,
904             auto vma>
905   void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
906     return OpVector<DataElementType,
907                     kSegmentSize,
908                     NumberOfRegistersInvolved(vlmul),
909                     IndexElementType,
910                     kIndexRegistersInvolved,
911                     vta,
912                     vma>(args, src);
913   }
914 
915   template <typename DataElementType,
916             size_t kSegmentSize,
917             size_t kNumRegistersInGroup,
918             typename IndexElementType,
919             size_t kIndexRegistersInvolved,
920             TailProcessing vta,
921             auto vma>
922   void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
923     if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
924       return Undefined();
925     }
926     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
927     alignas(alignof(SIMD128Register))
928         IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
929     memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
930     return OpVectorLoad<DataElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
931         args.dst, src, [&indexes](size_t index) { return indexes[index]; });
932   }
933 
934   template <typename ElementType,
935             size_t kSegmentSize,
936             VectorRegisterGroupMultiplier vlmul,
937             TailProcessing vta,
938             auto vma>
939   void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
940     return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(
941         args, src, stride);
942   }
943 
944   template <typename ElementType,
945             size_t kSegmentSize,
946             size_t kNumRegistersInGroup,
947             TailProcessing vta,
948             auto vma>
949   void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
950     return OpVectorLoad<ElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
951         args.dst, src, [stride](size_t index) { return stride * index; });
952   }
953 
954   template <typename ElementType,
955             size_t kSegmentSize,
956             VectorRegisterGroupMultiplier vlmul,
957             TailProcessing vta,
958             auto vma>
959   void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
960     return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(args,
961                                                                                            src);
962   }
963 
964   template <typename ElementType,
965             size_t kSegmentSize,
966             size_t kNumRegistersInGroup,
967             TailProcessing vta,
968             auto vma>
969   void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
970     switch (args.opcode) {
971       case Decoder::VLUmOpOpcode::kVleXXff:
972         return OpVectorLoad<ElementType,
973                             kSegmentSize,
974                             kNumRegistersInGroup,
975                             vta,
976                             vma,
977                             Decoder::VLUmOpOpcode::kVleXXff>(
978             args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
979       case Decoder::VLUmOpOpcode::kVleXX:
980         return OpVectorLoad<ElementType,
981                             kSegmentSize,
982                             kNumRegistersInGroup,
983                             vta,
984                             vma,
985                             Decoder::VLUmOpOpcode::kVleXX>(
986             args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
987       case Decoder::VLUmOpOpcode::kVlm:
988         if constexpr (kSegmentSize == 1 &&
989                       std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
990           return OpVectorLoad<UInt8,
991                               1,
992                               1,
993                               TailProcessing::kAgnostic,
994                               vma,
995                               Decoder::VLUmOpOpcode::kVlm>(
996               args.dst, src, [](size_t index) { return index; });
997         }
998         return Undefined();
999       default:
1000         return Undefined();
1001     }
1002   }
1003 
1004   // The strided version of segmented load may sound like something very convoluted and
1005   // complicated that no one would ever want to use, but it's not rare and can be illustrated
1006   // with a simple RGB bitmap example.
1007   //
1008   // Suppose it's laid out in memory like this (doubles are 8 bytes in size as per IEEE 754):
1009   //   {R: 0.01}{G: 0.11}{B: 0.21} {R: 1.01}{G: 1.11}{B: 1.21}, {R: 2.01}{G: 2.11}{B: 2.21}
1010   //   {R:10.01}{G:10.11}{B:10.21} {R:11.01}{G:11.11}{B:11.21}, {R:12.01}{G:12.11}{B:12.21}
1011   //   {R:20.01}{G:20.11}{B:20.21} {R:21.01}{G:21.11}{B:21.21}, {R:22.01}{G:22.11}{B:22.21}
1012   //   {R:30.01}{G:30.11}{B:30.21} {R:31.01}{G:31.11}{B:31.21}, {R:32.01}{G:32.11}{B:32.21}
1013   // This is very tiny 3x4 image with 3 components: red, green, blue.
1014   //
1015   // Let's assume that x1 is loaded with the address of the first element and x2 with 72 (that's
1016   // how many bytes one row of this image takes).
1017   //
1018   // Then we may use the following command to load values from memory (with LMUL = 2, ELEN = 4):
1019   //   vlsseg3e64.v v0, (x1), x2
1020   //
1021   // They would be loaded like this:
1022   //   v0: {R: 0.01}{R:10.01} (first group of 2 registers)
1023   //   v1: {R:20.01}{R:30.01}
1024   //   v2: {G: 0.11}{G:10.11} (second group of 2 registers)
1025   //   v3: {G:20.11}{G:30.11}
1026   //   v4: {B: 0.21}{B:10.21} (third group of 2 registers)
1027   //   v5: {B:20.21}{B:30.21}
1028   // Now we have loaded a column of pixels from memory, and each of the three colors is put into
1029   // a different register group for further processing.
1030   template <typename ElementType,
1031             size_t kSegmentSize,
1032             size_t kNumRegistersInGroup,
1033             TailProcessing vta,
1034             auto vma,
1035             typename Decoder::VLUmOpOpcode opcode = typename Decoder::VLUmOpOpcode{},
1036             typename GetElementOffsetLambdaType>
1037   void OpVectorLoad(uint8_t dst, Register src, GetElementOffsetLambdaType GetElementOffset) {
1038     using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
1039     if (!IsAligned<kNumRegistersInGroup>(dst)) {
1040       return Undefined();
1041     }
1042     if (dst + kNumRegistersInGroup * kSegmentSize > 32) {
1043       return Undefined();
1044     }
1045     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1046     size_t vstart = GetCsr<CsrName::kVstart>();
1047     size_t vl = GetCsr<CsrName::kVl>();
1048     if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) {
1049       vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
1050     }
1051     // In case of a memory access fault we may set vstart to a non-zero value, so set it to zero
1052     // here to simplify the logic below.
1053     SetCsr<CsrName::kVstart>(0);
1054     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1055     // vector register group, including that no tail elements are updated with agnostic values.
1056     if (vstart >= vl) [[unlikely]] {
1057       return;
1058     }
1059     if constexpr (vta == TailProcessing::kAgnostic) {
1060       vstart = std::min(vstart, vl);
1061     }
1062     // Note: within_group_id is the current register id within a register group. During one
1063     // iteration of this loop we compute results for all registers with the current id in all
1064     // groups. E.g. for the example above we'd compute v0, v2, v4 during the first iteration (id
1065     // within group = 0), and v1, v3, v5 during the second iteration (id within group = 1). This
1066     // ensures that memory is always accessed in ordered fashion.
1067     std::array<SIMD128Register, kSegmentSize> result;
1068     char* ptr = ToHostAddr<char>(src);
1069     auto mask = GetMaskForVectorOperations<vma>();
1070     for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
1071          ++within_group_id) {
1072       // No need to continue if we have kUndisturbed vta strategy.
1073       if constexpr (vta == TailProcessing::kUndisturbed) {
1074         if (within_group_id * kElementsCount >= vl) {
1075           break;
1076         }
1077       }
1078       // If we have elements that won't be overwritten then load these from registers.
1079       // For the interpreter we could have filled all the registers unconditionally, but we'll
1080       // want to reuse this code for JITs later.
1081       auto register_mask =
1082           std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
1083       auto full_mask = std::get<0>(intrinsics::FullMaskForRegister<ElementType>(mask));
1084       if (vstart ||
1085           (vl < (within_group_id + 1) * kElementsCount && vta == TailProcessing::kUndisturbed) ||
1086           !(std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing> ||
1087             static_cast<InactiveProcessing>(vma) != InactiveProcessing::kUndisturbed ||
1088             register_mask == full_mask)) {
1089         for (size_t field = 0; field < kSegmentSize; ++field) {
1090           result[field].Set(state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup]);
1091         }
1092       }
1093       // Read elements from memory, but only if there are any active ones.
1094       for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
1095            ++within_register_id) {
1096         size_t element_index = kElementsCount * within_group_id + within_register_id;
1097         // Stop if we reached the vl limit.
1098         if (vl <= element_index) {
1099           break;
1100         }
1101         // Don't touch masked-out elements.
1102         if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1103           if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
1104                                              1 << within_register_id)}) == MaskType{0}) {
1105             continue;
1106           }
1107         }
1108         // Load segment from memory.
1109         for (size_t field = 0; field < kSegmentSize; ++field) {
1110           FaultyLoadResult mem_access_result =
1111               FaultyLoad(ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
1112                          sizeof(ElementType));
1113           if (mem_access_result.is_fault) {
1114             // Documentation doesn't tell us what we are supposed to do with the remaining
1115             // elements when an access fault happens, but let's trigger an exception and treat the
1116             // remaining elements using the vta-specified strategy by simply adjusting the vl.
1117             vl = element_index;
1118             if constexpr (opcode == Decoder::VLUmOpOpcode::kVleXXff) {
1119               // Fail-first load only triggers exceptions for the first element, otherwise it
1120               // changes vl to ensure that other operations would only process elements that are
1121               // successfully loaded.
1122               if (element_index == 0) [[unlikely]] {
1123                 exception_raised_ = true;
1124               } else {
1125                 // TODO(b/323994286): Write a test case to verify vl changes correctly.
1126                 SetCsr<CsrName::kVl>(element_index);
1127               }
1128             } else {
1129               // Most load instructions set vstart to the failing element, which may then be
1130               // processed by the exception handler.
1131               exception_raised_ = true;
1132               SetCsr<CsrName::kVstart>(element_index);
1133             }
1134             break;
1135           }
1136           result[field].template Set<ElementType>(static_cast<ElementType>(mem_access_result.value),
1137                                                   within_register_id);
1138         }
1139       }
1140       // Lambda to generate the tail mask. We don't want to call MakeBitmaskFromVl eagerly: it's
1141       // not needed most of the time, and the compiler can't eliminate the access to mmap-backed memory.
1142       auto GetTailMask = [vl, within_group_id] {
1143         return std::get<0>(intrinsics::MakeBitmaskFromVl<ElementType>(
1144             (vl <= within_group_id * kElementsCount) ? 0 : vl - within_group_id * kElementsCount));
1145       };
1146       // If mask has inactive elements and InactiveProcessing::kAgnostic mode is used then set them
1147       // to ~0.
1148       if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1149         if (register_mask != full_mask) {
1150           auto [simd_mask] =
1151               intrinsics::BitMaskToSimdMask<ElementType>(Int64{MaskType{register_mask}});
1152           for (size_t field = 0; field < kSegmentSize; ++field) {
1153             if constexpr (vma == InactiveProcessing::kAgnostic) {
1154               // Note: non-zero vstart is supposed to be exceptional. From the RISC-V V manual
1155               // (page 14): The vstart CSR is writable by unprivileged code, but non-zero vstart
1156               // values may cause vector instructions to run substantially slower on some
1157               // implementations, so vstart should not be used by application programmers. A few
1158               // vector instructions cannot be executed with a non-zero vstart value and will
1159               // raise an illegal instruction exception as defined below.
1160               // TODO(b/300690740): decide whether to merge two cases after support for vectors in
1161               // heavy optimizer would be implemented.
1162               if (vstart) [[unlikely]] {
1163                 SIMD128Register vstart_mask = std::get<0>(
1164                     intrinsics::MakeBitmaskFromVl<ElementType>(vstart % kElementsCount));
1165                 if constexpr (vta == TailProcessing::kAgnostic) {
1166                   result[field] |= vstart_mask & ~simd_mask;
1167                 } else if (vl < (within_group_id + 1) * kElementsCount) {
1168                   result[field] |= vstart_mask & ~simd_mask & ~GetTailMask();
1169                 } else {
1170                   result[field] |= vstart_mask & ~simd_mask;
1171                 }
1172               } else if constexpr (vta == TailProcessing::kAgnostic) {
1173                 result[field] |= ~simd_mask;
1174               } else {
1175                 if (vl < (within_group_id + 1) * kElementsCount) {
1176                   result[field] |= ~simd_mask & ~GetTailMask();
1177                 } else {
1178                   result[field] |= ~simd_mask;
1179                 }
1180               }
1181             }
1182           }
1183         }
1184       }
1185       // If we have tail elements and TailProcessing::kAgnostic mode then set them to ~0.
1186       if constexpr (vta == TailProcessing::kAgnostic) {
1187         for (size_t field = 0; field < kSegmentSize; ++field) {
1188           if (vl < (within_group_id + 1) * kElementsCount) {
1189             result[field] |= GetTailMask();
1190           }
1191         }
1192       }
1193       // Put values back into register file.
1194       for (size_t field = 0; field < kSegmentSize; ++field) {
1195         state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup] =
1196             result[field].template Get<__uint128_t>();
1197       }
1198       // Next group should be fully processed.
1199       vstart = 0;
1200     }
1201   }
1202 
  // The vector register gather instructions read elements from the src1 vector register group at
  // locations given by the second source vector register group, src2.
  //   src1: source element vector register group.
  //   GetElementIndex: generic lambda that returns the element index taken from src2.
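  //   For example, vrgather.vi below passes a lambda that ignores its argument and always returns
  //   the immediate, while vrgather.vv copies the indexes out of src2 and passes a lambda that
  //   looks them up by element index.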
1207   template <typename ElementType,
1208             VectorRegisterGroupMultiplier vlmul,
1209             TailProcessing vta,
1210             auto vma,
1211             typename GetElementIndexLambdaType>
  void OpVectorGather(uint8_t dst, uint8_t src1, GetElementIndexLambdaType GetElementIndex) {
1213     constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
1214     if (!IsAligned<kRegistersInvolved>(dst | src1)) {
1215       return Undefined();
1216     }
1217     // Source and destination must not overlap.
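    // (Two register groups of kRegistersInvolved registers starting at dst and src1 overlap
    // exactly when each group starts below the other group's end, which is what the check below
    // tests.)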
1218     if (dst < (src1 + kRegistersInvolved) && src1 < (dst + kRegistersInvolved)) {
1219       return Undefined();
1220     }
1221     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1222     constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
1223 
1224     size_t vstart = GetCsr<CsrName::kVstart>();
1225     size_t vl = GetCsr<CsrName::kVl>();
1226     auto mask = GetMaskForVectorOperations<vma>();
1227     SetCsr<CsrName::kVstart>(0);
1228     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1229     // vector register group, including that no tail elements are updated with agnostic values.
1230     if (vstart >= vl) [[unlikely]] {
1231       return;
1232     }
1233 
    // Copy vlmul registers into an array of elements, then access elements of the temporary array.
1235     alignas(alignof(SIMD128Register)) ElementType values[vlmax];
1236     memcpy(values, state_->cpu.v + src1, sizeof(values));
1237     // Fill dst first, resolve mask later.
1238     for (size_t index = vstart / kElementsCount; index < kRegistersInvolved; ++index) {
1239       SIMD128Register original_dst_value;
1240       SIMD128Register result{state_->cpu.v[dst + index]};
1241       for (size_t dst_element_index = vstart % kElementsCount; dst_element_index < kElementsCount;
1242            ++dst_element_index) {
1243         size_t src_element_index = GetElementIndex(index * kElementsCount + dst_element_index);
1244 
1245         // If an element index is out of range ( vs1[i] >= VLMAX ) then zero is returned for the
1246         // element value.
1247         ElementType element_value = ElementType{0};
1248         if (src_element_index < vlmax) {
1249           element_value = values[src_element_index];
1250         }
1251         original_dst_value.Set<ElementType>(element_value, dst_element_index);
1252       }
1253 
1254       // Apply mask and put result values into dst register.
1255       result =
1256           VectorMasking<ElementType, vta, vma>(result, original_dst_value, vstart, vl, index, mask);
1257       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
1258       // Next group should be fully processed.
1259       vstart = 0;
1260     }
1261   }
1262 
1263   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
  void OpVector(const Decoder::VOpFVfArgs& args, ElementType arg2) {
1265     using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1266     if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1267       // Keep cases sorted in opcode order to match RISC-V V manual.
1268       switch (args.opcode) {
1269         case Decoder::VOpFVfOpcode::kVfwaddvf:
1270           return OpVectorWidenvx<intrinsics::Vfwaddvf<ElementType>,
1271                                  ElementType,
1272                                  vlmul,
1273                                  vta,
1274                                  vma,
1275                                  kFrm>(args.dst, args.src1, arg2);
1276         case Decoder::VOpFVfOpcode::kVfwsubvf:
1277           return OpVectorWidenvx<intrinsics::Vfwsubvf<ElementType>,
1278                                  ElementType,
1279                                  vlmul,
1280                                  vta,
1281                                  vma,
1282                                  kFrm>(args.dst, args.src1, arg2);
1283         case Decoder::VOpFVfOpcode::kVfwmulvf:
1284           return OpVectorWidenvx<intrinsics::Vfwmulvf<ElementType>,
1285                                  ElementType,
1286                                  vlmul,
1287                                  vta,
1288                                  vma,
1289                                  kFrm>(args.dst, args.src1, arg2);
1290         case Decoder::VOpFVfOpcode::kVfwaddwf:
1291           return OpVectorWidenwx<intrinsics::Vfwaddwf<ElementType>,
1292                                  ElementType,
1293                                  vlmul,
1294                                  vta,
1295                                  vma,
1296                                  kFrm>(args.dst, args.src1, arg2);
1297         case Decoder::VOpFVfOpcode::kVfwsubwf:
1298           return OpVectorWidenwx<intrinsics::Vfwsubwf<ElementType>,
1299                                  ElementType,
1300                                  vlmul,
1301                                  vta,
1302                                  vma,
1303                                  kFrm>(args.dst, args.src1, arg2);
1304         case Decoder::VOpFVfOpcode::kVfwmaccvf:
1305           return OpVectorWidenvxw<intrinsics::Vfwmaccvf<ElementType>,
1306                                   ElementType,
1307                                   vlmul,
1308                                   vta,
1309                                   vma,
1310                                   kFrm>(args.dst, args.src1, arg2);
1311         case Decoder::VOpFVfOpcode::kVfwnmaccvf:
1312           return OpVectorWidenvxw<intrinsics::Vfwnmaccvf<ElementType>,
1313                                   ElementType,
1314                                   vlmul,
1315                                   vta,
1316                                   vma,
1317                                   kFrm>(args.dst, args.src1, arg2);
1318         case Decoder::VOpFVfOpcode::kVfwmsacvf:
1319           return OpVectorWidenvxw<intrinsics::Vfwmsacvf<ElementType>,
1320                                   ElementType,
1321                                   vlmul,
1322                                   vta,
1323                                   vma,
1324                                   kFrm>(args.dst, args.src1, arg2);
1325         case Decoder::VOpFVfOpcode::kVfwnmsacvf:
1326           return OpVectorWidenvxw<intrinsics::Vfwnmsacvf<ElementType>,
1327                                   ElementType,
1328                                   vlmul,
1329                                   vta,
1330                                   vma,
1331                                   kFrm>(args.dst, args.src1, arg2);
1332         default:
1333           break;
1334       }
1335     }
1336     // Keep cases sorted in opcode order to match RISC-V V manual.
1337     switch (args.opcode) {
1338       case Decoder::VOpFVfOpcode::kVfminvf:
1339         return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
1340             args.dst, args.src1, arg2);
1341       case Decoder::VOpFVfOpcode::kVfmaxvf:
1342         return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
1343             args.dst, args.src1, arg2);
1344       case Decoder::VOpFVfOpcode::kVfsgnjvf:
1345         return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
1346             args.dst, args.src1, arg2);
1347       case Decoder::VOpFVfOpcode::kVfsgnjnvf:
1348         return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
1349             args.dst, args.src1, arg2);
1350       case Decoder::VOpFVfOpcode::kVfsgnjxvf:
1351         return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
1352             args.dst, args.src1, arg2);
1353       case Decoder::VOpFVfOpcode::kVfslide1upvf:
1354         return OpVectorslide1up<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1355       case Decoder::VOpFVfOpcode::kVfslide1downvf:
1356         return OpVectorslide1down<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1357       case Decoder::VOpFVfOpcode::kVfmvsf:
1358         if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1359           return Undefined();
1360         }
1361         if (args.src1 != 0) {
1362           return Undefined();
1363         }
1364         return OpVectorVmvsx<ElementType, vta>(args.dst, arg2);
1365       case Decoder::VOpFVfOpcode::kVfmergevf:
1366         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1367           if (args.src1 != 0) {
1368             return Undefined();
1369           }
1370           return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(args.dst,
1371                                                                                           arg2);
1372         } else {
1373           return OpVectorx<intrinsics::Vcopyx<ElementType>,
1374                            ElementType,
1375                            vlmul,
1376                            vta,
1377                            // Always use "undisturbed" value from source register.
1378                            InactiveProcessing::kUndisturbed>(
1379               args.dst, arg2, /*dst_mask=*/args.src1);
1380         }
1381       case Decoder::VOpFVfOpcode::kVmfeqvf:
1382         return OpVectorToMaskvx<intrinsics::Vfeqvx<ElementType>, ElementType, vlmul, vma>(
1383             args.dst, args.src1, arg2);
1384       case Decoder::VOpFVfOpcode::kVmflevf:
1385         return OpVectorToMaskvx<intrinsics::Vflevx<ElementType>, ElementType, vlmul, vma>(
1386             args.dst, args.src1, arg2);
1387       case Decoder::VOpFVfOpcode::kVmfltvf:
1388         return OpVectorToMaskvx<intrinsics::Vfltvx<ElementType>, ElementType, vlmul, vma>(
1389             args.dst, args.src1, arg2);
1390       case Decoder::VOpFVfOpcode::kVmfnevf:
1391         return OpVectorToMaskvx<intrinsics::Vfnevx<ElementType>, ElementType, vlmul, vma>(
1392             args.dst, args.src1, arg2);
1393       case Decoder::VOpFVfOpcode::kVmfgtvf:
1394         return OpVectorToMaskvx<intrinsics::Vfgtvx<ElementType>, ElementType, vlmul, vma>(
1395             args.dst, args.src1, arg2);
1396       case Decoder::VOpFVfOpcode::kVmfgevf:
1397         return OpVectorToMaskvx<intrinsics::Vfgevx<ElementType>, ElementType, vlmul, vma>(
1398             args.dst, args.src1, arg2);
1399       case Decoder::VOpFVfOpcode::kVfdivvf:
1400         return OpVectorSameWidth<intrinsics::Vfdivvf<ElementType>,
1401                                  ElementType,
1402                                  NumberOfRegistersInvolved(vlmul),
1403                                  vta,
1404                                  vma,
1405                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1406       case Decoder::VOpFVfOpcode::kVfrdivvf:
1407         return OpVectorSameWidth<intrinsics::Vfrdivvf<ElementType>,
1408                                  ElementType,
1409                                  NumberOfRegistersInvolved(vlmul),
1410                                  vta,
1411                                  vma,
1412                                  kFrm>(
1413             args.dst,
1414             Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
1415                                                                     : 0x3ff0'0000'0000'0000}>{
1416                 args.src1},
1417             arg2);
1418       case Decoder::VOpFVfOpcode::kVfmulvf:
1419         return OpVectorSameWidth<intrinsics::Vfmulvf<ElementType>,
1420                                  ElementType,
1421                                  NumberOfRegistersInvolved(vlmul),
1422                                  vta,
1423                                  vma,
1424                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1425       case Decoder::VOpFVfOpcode::kVfaddvf:
1426         return OpVectorSameWidth<intrinsics::Vfaddvf<ElementType>,
1427                                  ElementType,
1428                                  NumberOfRegistersInvolved(vlmul),
1429                                  vta,
1430                                  vma,
1431                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1432       case Decoder::VOpFVfOpcode::kVfsubvf:
1433         return OpVectorSameWidth<intrinsics::Vfsubvf<ElementType>,
1434                                  ElementType,
1435                                  NumberOfRegistersInvolved(vlmul),
1436                                  vta,
1437                                  vma,
1438                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1439       case Decoder::VOpFVfOpcode::kVfrsubvf:
1440         return OpVectorSameWidth<intrinsics::Vfrsubvf<ElementType>,
1441                                  ElementType,
1442                                  NumberOfRegistersInvolved(vlmul),
1443                                  vta,
1444                                  vma,
1445                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1446       case Decoder::VOpFVfOpcode::kVfmaccvf:
1447         return OpVectorvxv<intrinsics::Vfmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1448             args.dst, args.src1, arg2);
1449       case Decoder::VOpFVfOpcode::kVfmsacvf:
1450         return OpVectorvxv<intrinsics::Vfmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1451             args.dst, args.src1, arg2);
1452       case Decoder::VOpFVfOpcode::kVfmaddvf:
1453         return OpVectorvxv<intrinsics::Vfmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1454             args.dst, args.src1, arg2);
1455       case Decoder::VOpFVfOpcode::kVfmsubvf:
1456         return OpVectorvxv<intrinsics::Vfmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1457             args.dst, args.src1, arg2);
1458       case Decoder::VOpFVfOpcode::kVfnmaccvf:
1459         return OpVectorvxv<intrinsics::Vfnmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1460             args.dst, args.src1, arg2);
1461       case Decoder::VOpFVfOpcode::kVfnmsacvf:
1462         return OpVectorvxv<intrinsics::Vfnmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1463             args.dst, args.src1, arg2);
1464       case Decoder::VOpFVfOpcode::kVfnmaddvf:
1465         return OpVectorvxv<intrinsics::Vfnmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1466             args.dst, args.src1, arg2);
1467       case Decoder::VOpFVfOpcode::kVfnmsubvf:
1468         return OpVectorvxv<intrinsics::Vfnmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1469             args.dst, args.src1, arg2);
1470       default:
1471         return Undefined();
1472     }
1473   }
1474 
1475   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
  void OpVector(const Decoder::VOpFVvArgs& args) {
1477     using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1478     using UnsignedType = Wrapping<std::make_unsigned_t<typename TypeTraits<ElementType>::Int>>;
1479     // We currently don't support Float16 operations, but conversion routines that deal with
1480     // double-width floats use these encodings to produce regular Float32 types.
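    // For instance, vfwcvt.f.xu.v with SEW=16 reads 16-bit unsigned integers and widens them into
    // Float32 results, so such conversions are handled here even though Float16 arithmetic itself
    // is not implemented.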
1481     if constexpr (sizeof(ElementType) <= sizeof(Float32)) {
1482       using WideElementType = typename TypeTraits<ElementType>::Wide;
1483       // Keep cases sorted in opcode order to match RISC-V V manual.
1484       switch (args.opcode) {
1485         case Decoder::VOpFVvOpcode::kVFUnary0:
1486           switch (args.vfunary0_opcode) {
1487             case Decoder::VFUnary0Opcode::kVfwcvtfxuv:
1488               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1489                 return intrinsics::Vfcvtv<WideElementType, UnsignedType>(FPFlags::DYN, frm, src);
1490               },
1491                                     UnsignedType,
1492                                     vlmul,
1493                                     vta,
1494                                     vma,
1495                                     kFrm>(args.dst, args.src1);
1496             case Decoder::VFUnary0Opcode::kVfwcvtfxv:
1497               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1498                 return intrinsics::Vfcvtv<WideElementType, SignedType>(FPFlags::DYN, frm, src);
1499               },
1500                                     SignedType,
1501                                     vlmul,
1502                                     vta,
1503                                     vma,
1504                                     kFrm>(args.dst, args.src1);
1505             case Decoder::VFUnary0Opcode::kVfncvtxufw:
1506               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1507                 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::DYN, frm, src);
1508               },
1509                                      UnsignedType,
1510                                      vlmul,
1511                                      vta,
1512                                      vma,
1513                                      kFrm>(args.dst, args.src1);
1514             case Decoder::VFUnary0Opcode::kVfncvtxfw:
1515               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1516                 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::DYN, frm, src);
1517               },
1518                                      SignedType,
1519                                      vlmul,
1520                                      vta,
1521                                      vma,
1522                                      kFrm>(args.dst, args.src1);
1523             case Decoder::VFUnary0Opcode::kVfncvtrtzxufw:
1524               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1525                 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::RTZ, frm, src);
1526               },
1527                                      UnsignedType,
1528                                      vlmul,
1529                                      vta,
1530                                      vma,
1531                                      kFrm>(args.dst, args.src1);
1532             case Decoder::VFUnary0Opcode::kVfncvtrtzxfw:
1533               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1534                 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::RTZ, frm, src);
1535               },
1536                                      SignedType,
1537                                      vlmul,
1538                                      vta,
1539                                      vma,
1540                                      kFrm>(args.dst, args.src1);
1541             default:
1542               break;  // Make compiler happy.
1543           }
1544           break;
1545         default:
1546           break;  // Make compiler happy.
1547       }
1548     }
    // Widening and narrowing operations which take a floating point "narrow" operand may only work
    // correctly with Float32 input: Float16 is not supported yet, while Float64 input would produce
    // a 128-bit output which is currently reserved in RISC-V V.
1552     if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1553       using WideElementType = WideType<ElementType>;
1554       using WideSignedType = WideType<SignedType>;
1555       using WideUnsignedType = WideType<UnsignedType>;
1556       // Keep cases sorted in opcode order to match RISC-V V manual.
1557       switch (args.opcode) {
1558         case Decoder::VOpFVvOpcode::kVfwaddvv:
1559           return OpVectorWidenvv<intrinsics::Vfwaddvv<ElementType>,
1560                                  ElementType,
1561                                  vlmul,
1562                                  vta,
1563                                  vma,
1564                                  kFrm>(args.dst, args.src1, args.src2);
1565         case Decoder::VOpFVvOpcode::kVfwsubvv:
1566           return OpVectorWidenvv<intrinsics::Vfwsubvv<ElementType>,
1567                                  ElementType,
1568                                  vlmul,
1569                                  vta,
1570                                  vma,
1571                                  kFrm>(args.dst, args.src1, args.src2);
1572         case Decoder::VOpFVvOpcode::kVfwmulvv:
1573           return OpVectorWidenvv<intrinsics::Vfwmulvv<ElementType>,
1574                                  ElementType,
1575                                  vlmul,
1576                                  vta,
1577                                  vma,
1578                                  kFrm>(args.dst, args.src1, args.src2);
1579         case Decoder::VOpFVvOpcode::kVfwaddwv:
1580           return OpVectorWidenwv<intrinsics::Vfwaddwv<ElementType>,
1581                                  ElementType,
1582                                  vlmul,
1583                                  vta,
1584                                  vma,
1585                                  kFrm>(args.dst, args.src1, args.src2);
1586         case Decoder::VOpFVvOpcode::kVfwsubwv:
1587           return OpVectorWidenwv<intrinsics::Vfwsubwv<ElementType>,
1588                                  ElementType,
1589                                  vlmul,
1590                                  vta,
1591                                  vma,
1592                                  kFrm>(args.dst, args.src1, args.src2);
1593         case Decoder::VOpFVvOpcode::kVfwmaccvv:
1594           return OpVectorWidenvvw<intrinsics::Vfwmaccvv<ElementType>,
1595                                   ElementType,
1596                                   vlmul,
1597                                   vta,
1598                                   vma,
1599                                   kFrm>(args.dst, args.src1, args.src2);
1600         case Decoder::VOpFVvOpcode::kVfwnmaccvv:
1601           return OpVectorWidenvvw<intrinsics::Vfwnmaccvv<ElementType>,
1602                                   ElementType,
1603                                   vlmul,
1604                                   vta,
1605                                   vma,
1606                                   kFrm>(args.dst, args.src1, args.src2);
1607         case Decoder::VOpFVvOpcode::kVfwmsacvv:
1608           return OpVectorWidenvvw<intrinsics::Vfwmsacvv<ElementType>,
1609                                   ElementType,
1610                                   vlmul,
1611                                   vta,
1612                                   vma,
1613                                   kFrm>(args.dst, args.src1, args.src2);
1614         case Decoder::VOpFVvOpcode::kVfwnmsacvv:
1615           return OpVectorWidenvvw<intrinsics::Vfwnmsacvv<ElementType>,
1616                                   ElementType,
1617                                   vlmul,
1618                                   vta,
1619                                   vma,
1620                                   kFrm>(args.dst, args.src1, args.src2);
1621         case Decoder::VOpFVvOpcode::kVFUnary0:
1622           switch (args.vfunary0_opcode) {
1623             case Decoder::VFUnary0Opcode::kVfwcvtxufv:
1624               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1625                 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::DYN, frm, src);
1626               },
1627                                     ElementType,
1628                                     vlmul,
1629                                     vta,
1630                                     vma,
1631                                     kFrm>(args.dst, args.src1);
1632             case Decoder::VFUnary0Opcode::kVfwcvtxfv:
1633               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1634                 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::DYN, frm, src);
1635               },
1636                                     ElementType,
1637                                     vlmul,
1638                                     vta,
1639                                     vma,
1640                                     kFrm>(args.dst, args.src1);
1641             case Decoder::VFUnary0Opcode::kVfwcvtffv:
1642               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1643                 return intrinsics::Vfcvtv<WideElementType, ElementType>(FPFlags::DYN, frm, src);
1644               },
1645                                     ElementType,
1646                                     vlmul,
1647                                     vta,
1648                                     vma,
1649                                     kFrm>(args.dst, args.src1);
1650             case Decoder::VFUnary0Opcode::kVfwcvtrtzxufv:
1651               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1652                 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1653               },
1654                                     ElementType,
1655                                     vlmul,
1656                                     vta,
1657                                     vma,
1658                                     kFrm>(args.dst, args.src1);
1659             case Decoder::VFUnary0Opcode::kVfwcvtrtzxfv:
1660               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1661                 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::RTZ, frm, src);
1662               },
1663                                     ElementType,
1664                                     vlmul,
1665                                     vta,
1666                                     vma,
1667                                     kFrm>(args.dst, args.src1);
1668             case Decoder::VFUnary0Opcode::kVfncvtfxuw:
1669               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1670                 return intrinsics::Vfcvtv<ElementType, WideUnsignedType>(FPFlags::DYN, frm, src);
1671               },
1672                                      ElementType,
1673                                      vlmul,
1674                                      vta,
1675                                      vma,
1676                                      kFrm>(args.dst, args.src1);
1677             case Decoder::VFUnary0Opcode::kVfncvtffw:
1678               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1679                 return intrinsics::Vfcvtv<ElementType, WideElementType>(FPFlags::DYN, frm, src);
1680               },
1681                                      ElementType,
1682                                      vlmul,
1683                                      vta,
1684                                      vma,
1685                                      kFrm>(args.dst, args.src1);
1686             case Decoder::VFUnary0Opcode::kVfncvtfxw:
1687               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1688                 return intrinsics::Vfcvtv<ElementType, WideSignedType>(FPFlags::DYN, frm, src);
1689               },
1690                                      ElementType,
1691                                      vlmul,
1692                                      vta,
1693                                      vma,
1694                                      kFrm>(args.dst, args.src1);
1695             default:
1696               break;  // Make compiler happy.
1697           }
1698           break;
1699         default:
1700           break;  // Make compiler happy.
1701       }
1702     }
    // If our ElementType is Float16 then "straight" operations are unsupported and we shouldn't try
    // to instantiate any functions since this would lead to a compile-time error.
1705     if constexpr (sizeof(ElementType) >= sizeof(Float32)) {
      // The floating point IEEE 754 value -0.0 has only the top bit set and all other bits clear:
      // https://en.wikipedia.org/wiki/Signed_zero#Representations This is exactly the same
      // representation the minimum negative integer has in two's complement:
      // https://en.wikipedia.org/wiki/Two%27s_complement#Most_negative_number
      // Note: we pass filler elements as integers because `Float32`/`Float64` cannot be used as
      // template parameters.
1712       constexpr SignedType kNegativeZero{std::numeric_limits<typename SignedType::BaseType>::min()};
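      // (For Float32 this is the bit pattern 0x8000'0000, i.e. INT32_MIN; for Float64 it is
      // 0x8000'0000'0000'0000, i.e. INT64_MIN.)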
      // The floating point IEEE 754 value +0.0 consists of only zero bits, the same as integer
      // zero.
1714       constexpr SignedType kPositiveZero{};
1715       // Keep cases sorted in opcode order to match RISC-V V manual.
1716       switch (args.opcode) {
1717         case Decoder::VOpFVvOpcode::kVfredusumvs:
1718           // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1719           // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
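          // (In IEEE 754 the sum of two zeros of opposite sign is +0.0 in every rounding mode
          // except round-down, where it is -0.0; the identity is therefore the zero whose sign
          // never leaks into the result, even when the reduction sum is itself a signed zero.)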
1720           if (GetCsr<kFrm>() != FPFlags::RDN) {
1721             return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1722                               ElementType,
1723                               vlmul,
1724                               vta,
1725                               vma,
1726                               kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1727           } else {
1728             return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1729                               ElementType,
1730                               vlmul,
1731                               vta,
1732                               vma,
1733                               kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1734           }
1735         case Decoder::VOpFVvOpcode::kVfredosumvs:
1736           // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1737           // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
1738           if (GetCsr<kFrm>() != FPFlags::RDN) {
1739             return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1740                               ElementType,
1741                               vlmul,
1742                               vta,
1743                               vma,
1744                               kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1745           } else {
1746             return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1747                               ElementType,
1748                               vlmul,
1749                               vta,
1750                               vma,
1751                               kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1752           }
1753         case Decoder::VOpFVvOpcode::kVfminvv:
1754           return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>(
1755               args.dst, args.src1, args.src2);
1756         case Decoder::VOpFVvOpcode::kVfredminvs:
1757           // For Vfredmin the identity element is +inf.
1758           return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>(
1759               args.dst,
1760               Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000
1761                                                                         : 0x7ff0'0000'0000'0000}>{
1762                   args.src1},
1763               args.src2);
1764         case Decoder::VOpFVvOpcode::kVfmaxvv:
1765           return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
1766               args.dst, args.src1, args.src2);
1767         case Decoder::VOpFVvOpcode::kVfredmaxvs:
1768           // For Vfredmax the identity element is -inf.
1769           return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>(
1770               args.dst,
1771               Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000
1772                                                                         : 0xfff0'0000'0000'0000}>{
1773                   args.src1},
1774               args.src2);
1775         case Decoder::VOpFVvOpcode::kVfsgnjvv:
1776           return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>(
1777               args.dst, args.src1, args.src2);
1778         case Decoder::VOpFVvOpcode::kVfsgnjnvv:
1779           return OpVectorvv<intrinsics::Vfsgnjnvv<ElementType>, ElementType, vlmul, vta, vma>(
1780               args.dst, args.src1, args.src2);
1781         case Decoder::VOpFVvOpcode::kVfsgnjxvv:
1782           return OpVectorvv<intrinsics::Vfsgnjxvv<ElementType>, ElementType, vlmul, vta, vma>(
1783               args.dst, args.src1, args.src2);
1784         case Decoder::VOpFVvOpcode::kVFUnary0:
1785           switch (args.vfunary0_opcode) {
1786             case Decoder::VFUnary0Opcode::kVfcvtxufv:
1787               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1788                 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::DYN, frm, src);
1789               },
1790                                ElementType,
1791                                vlmul,
1792                                vta,
1793                                vma,
1794                                kFrm>(args.dst, args.src1);
1795             case Decoder::VFUnary0Opcode::kVfcvtxfv:
1796               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1797                 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::DYN, frm, src);
1798               },
1799                                ElementType,
1800                                vlmul,
1801                                vta,
1802                                vma,
1803                                kFrm>(args.dst, args.src1);
1804             case Decoder::VFUnary0Opcode::kVfcvtfxuv:
1805               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1806                 return intrinsics::Vfcvtv<ElementType, UnsignedType>(FPFlags::DYN, frm, src);
1807               },
1808                                UnsignedType,
1809                                vlmul,
1810                                vta,
1811                                vma,
1812                                kFrm>(args.dst, args.src1);
1813             case Decoder::VFUnary0Opcode::kVfcvtfxv:
1814               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1815                 return intrinsics::Vfcvtv<ElementType, SignedType>(FPFlags::DYN, frm, src);
1816               },
1817                                SignedType,
1818                                vlmul,
1819                                vta,
1820                                vma,
1821                                kFrm>(args.dst, args.src1);
1822             case Decoder::VFUnary0Opcode::kVfcvtrtzxufv:
1823               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1824                 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1825               },
1826                                ElementType,
1827                                vlmul,
1828                                vta,
1829                                vma,
1830                                kFrm>(args.dst, args.src1);
1831             case Decoder::VFUnary0Opcode::kVfcvtrtzxfv:
1832               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1833                 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::RTZ, frm, src);
1834               },
1835                                ElementType,
1836                                vlmul,
1837                                vta,
1838                                vma,
1839                                kFrm>(args.dst, args.src1);
1840             default:
1841               break;  // Make compiler happy.
1842           }
1843           break;
1844         case Decoder::VOpFVvOpcode::kVFUnary1:
1845           switch (args.vfunary1_opcode) {
1846             case Decoder::VFUnary1Opcode::kVfsqrtv:
1847               return OpVectorv<intrinsics::Vfsqrtv<ElementType>,
1848                                ElementType,
1849                                vlmul,
1850                                vta,
1851                                vma,
1852                                kFrm>(args.dst, args.src1);
1854             case Decoder::VFUnary1Opcode::kVfrsqrt7v:
1855               return OpVectorv<intrinsics::Vfrsqrt7v<ElementType>, ElementType, vlmul, vta, vma>(
1856                   args.dst, args.src1);
1858             case Decoder::VFUnary1Opcode::kVfclassv:
1859               return OpVectorv<intrinsics::Vfclassv<ElementType>, ElementType, vlmul, vta, vma>(
1860                   args.dst, args.src1);
1862             default:
1863               break;  // Make compiler happy.
1864           }
1865           break;
1866         case Decoder::VOpFVvOpcode::kVfmvfs:
1867           if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1868             return Undefined();
1869           }
1870           if (args.src2 != 0) {
1871             return Undefined();
1872           }
1873           return OpVectorVmvfs<ElementType>(args.dst, args.src1);
1874         case Decoder::VOpFVvOpcode::kVmfeqvv:
1875           return OpVectorToMaskvv<intrinsics::Vfeqvv<ElementType>, ElementType, vlmul, vma>(
1876               args.dst, args.src1, args.src2);
1877         case Decoder::VOpFVvOpcode::kVmflevv:
1878           return OpVectorToMaskvv<intrinsics::Vflevv<ElementType>, ElementType, vlmul, vma>(
1879               args.dst, args.src1, args.src2);
1880         case Decoder::VOpFVvOpcode::kVmfltvv:
1881           return OpVectorToMaskvv<intrinsics::Vfltvv<ElementType>, ElementType, vlmul, vma>(
1882               args.dst, args.src1, args.src2);
1883         case Decoder::VOpFVvOpcode::kVmfnevv:
1884           return OpVectorToMaskvv<intrinsics::Vfnevv<ElementType>, ElementType, vlmul, vma>(
1885               args.dst, args.src1, args.src2);
1886         case Decoder::VOpFVvOpcode::kVfdivvv:
1887           return OpVectorSameWidth<intrinsics::Vfdivvv<ElementType>,
1888                                    ElementType,
1889                                    NumberOfRegistersInvolved(vlmul),
1890                                    vta,
1891                                    vma,
1892                                    kFrm>(
1893               args.dst,
1894               Vec<SignedType{}>{args.src1},
1895               Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
1896                                                                       : 0x3ff0'0000'0000'0000}>{
1897                   args.src2});
1898         case Decoder::VOpFVvOpcode::kVfmulvv:
1899           return OpVectorSameWidth<intrinsics::Vfmulvv<ElementType>,
1900                                    ElementType,
1901                                    NumberOfRegistersInvolved(vlmul),
1902                                    vta,
1903                                    vma,
1904                                    kFrm>(
1905               args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
1906         case Decoder::VOpFVvOpcode::kVfaddvv:
1907           return OpVectorSameWidth<intrinsics::Vfaddvv<ElementType>,
1908                                    ElementType,
1909                                    NumberOfRegistersInvolved(vlmul),
1910                                    vta,
1911                                    vma,
1912                                    kFrm>(
1913               args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
1914         case Decoder::VOpFVvOpcode::kVfsubvv:
1915           return OpVectorSameWidth<intrinsics::Vfsubvv<ElementType>,
1916                                    ElementType,
1917                                    NumberOfRegistersInvolved(vlmul),
1918                                    vta,
1919                                    vma,
1920                                    kFrm>(
1921               args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
1922         case Decoder::VOpFVvOpcode::kVfmaccvv:
1923           return OpVectorvvv<intrinsics::Vfmaccvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1924               args.dst, args.src1, args.src2);
1925         case Decoder::VOpFVvOpcode::kVfmsacvv:
1926           return OpVectorvvv<intrinsics::Vfmsacvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1927               args.dst, args.src1, args.src2);
1928         case Decoder::VOpFVvOpcode::kVfmaddvv:
1929           return OpVectorvvv<intrinsics::Vfmaddvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1930               args.dst, args.src1, args.src2);
1931         case Decoder::VOpFVvOpcode::kVfmsubvv:
1932           return OpVectorvvv<intrinsics::Vfmsubvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1933               args.dst, args.src1, args.src2);
1934         case Decoder::VOpFVvOpcode::kVfnmaccvv:
1935           return OpVectorvvv<intrinsics::Vfnmaccvv<ElementType>,
1936                              ElementType,
1937                              vlmul,
1938                              vta,
1939                              vma,
1940                              kFrm>(args.dst, args.src1, args.src2);
1941         case Decoder::VOpFVvOpcode::kVfnmsacvv:
1942           return OpVectorvvv<intrinsics::Vfnmsacvv<ElementType>,
1943                              ElementType,
1944                              vlmul,
1945                              vta,
1946                              vma,
1947                              kFrm>(args.dst, args.src1, args.src2);
1948         case Decoder::VOpFVvOpcode::kVfnmaddvv:
1949           return OpVectorvvv<intrinsics::Vfnmaddvv<ElementType>,
1950                              ElementType,
1951                              vlmul,
1952                              vta,
1953                              vma,
1954                              kFrm>(args.dst, args.src1, args.src2);
1955         case Decoder::VOpFVvOpcode::kVfnmsubvv:
1956           return OpVectorvvv<intrinsics::Vfnmsubvv<ElementType>,
1957                              ElementType,
1958                              vlmul,
1959                              vta,
1960                              vma,
1961                              kFrm>(args.dst, args.src1, args.src2);
1962         default:
1963           break;  // Make compiler happy.
1964       }
1965     }
1966     return Undefined();
1967   }
1968 
1969   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
  void OpVector(const Decoder::VOpIViArgs& args) {
1971     using SignedType = berberis::SignedType<ElementType>;
1972     using UnsignedType = berberis::UnsignedType<ElementType>;
1973     using SaturatingSignedType = SaturatingType<SignedType>;
1974     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
1975     // Keep cases sorted in opcode order to match RISC-V V manual.
1976     switch (args.opcode) {
1977       case Decoder::VOpIViOpcode::kVaddvi:
1978         return OpVectorvx<intrinsics::Vaddvx<SignedType>, SignedType, vlmul, vta, vma>(
1979             args.dst, args.src, SignedType{args.imm});
1980       case Decoder::VOpIViOpcode::kVrsubvi:
1981         return OpVectorvx<intrinsics::Vrsubvx<SignedType>, SignedType, vlmul, vta, vma>(
1982             args.dst, args.src, SignedType{args.imm});
1983       case Decoder::VOpIViOpcode::kVandvi:
1984         return OpVectorvx<intrinsics::Vandvx<SignedType>, SignedType, vlmul, vta, vma>(
1985             args.dst, args.src, SignedType{args.imm});
1986       case Decoder::VOpIViOpcode::kVorvi:
1987         return OpVectorvx<intrinsics::Vorvx<SignedType>, SignedType, vlmul, vta, vma>(
1988             args.dst, args.src, SignedType{args.imm});
1989       case Decoder::VOpIViOpcode::kVxorvi:
1990         return OpVectorvx<intrinsics::Vxorvx<SignedType>, SignedType, vlmul, vta, vma>(
1991             args.dst, args.src, SignedType{args.imm});
1992       case Decoder::VOpIViOpcode::kVrgathervi:
1993         return OpVectorGather<ElementType, vlmul, vta, vma>(
1994             args.dst, args.src, [&args](size_t /*index*/) { return ElementType{args.uimm}; });
1995       case Decoder::VOpIViOpcode::kVmseqvi:
1996         return OpVectorToMaskvx<intrinsics::Vseqvx<SignedType>, SignedType, vlmul, vma>(
1997             args.dst, args.src, SignedType{args.imm});
1998       case Decoder::VOpIViOpcode::kVmsnevi:
1999         return OpVectorToMaskvx<intrinsics::Vsnevx<SignedType>, SignedType, vlmul, vma>(
2000             args.dst, args.src, SignedType{args.imm});
2001       case Decoder::VOpIViOpcode::kVmsleuvi:
        // Note: Vmsleu.vi actually has a signed immediate, which means that we first need to
        // sign-extend it to the width of the element and then bit-cast to unsigned.
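        // For example, with SEW=8 the 5-bit immediate -1 sign-extends to int8_t{-1} and then
        // bit-casts to uint8_t{0xff}, i.e. the maximum unsigned value, rather than to 0x1f.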
2004         return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2005             args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2006       case Decoder::VOpIViOpcode::kVmslevi:
2007         return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2008             args.dst, args.src, SignedType{args.imm});
2009       case Decoder::VOpIViOpcode::kVmsgtuvi:
        // Note: Vmsgtu.vi actually has a signed immediate, which means that we first need to
        // sign-extend it to the width of the element and then bit-cast to unsigned.
2012         return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2013             args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2014       case Decoder::VOpIViOpcode::kVmsgtvi:
2015         return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2016             args.dst, args.src, SignedType{args.imm});
2017       case Decoder::VOpIViOpcode::kVsadduvi:
        // Note: Vsaddu.vi actually has a signed immediate, which means that we first need to
        // sign-extend it to the width of the element and then bit-cast to unsigned.
2020         return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2021                           SaturatingUnsignedType,
2022                           vlmul,
2023                           vta,
2024                           vma>(
2025             args.dst, args.src, BitCastToUnsigned(SaturatingSignedType{args.imm}));
2026       case Decoder::VOpIViOpcode::kVsaddvi:
2027         return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2028                           SaturatingSignedType,
2029                           vlmul,
2030                           vta,
2031                           vma>(args.dst, args.src, SaturatingSignedType{args.imm});
2032       case Decoder::VOpIViOpcode::kVsllvi:
2033         return OpVectorvx<intrinsics::Vslvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2034             args.dst, args.src, UnsignedType{args.uimm});
2035       case Decoder::VOpIViOpcode::kVsrlvi:
2036         return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2037             args.dst, args.src, UnsignedType{args.uimm});
2038       case Decoder::VOpIViOpcode::kVsravi:
        // We need to pass the shift value here as a signed type, but the uimm value is always
        // positive and always fits into any integer type.
2041         return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2042             args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2043       case Decoder::VOpIViOpcode::kVmergevi:
2044         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2045           if (args.src != 0) {
2046             return Undefined();
2047           }
2048           return OpVectorx<intrinsics::Vcopyx<SignedType>, SignedType, vlmul, vta, vma>(
2049               args.dst, SignedType{args.imm});
2050         } else {
2051           return OpVectorx<intrinsics::Vcopyx<SignedType>,
2052                            SignedType,
2053                            vlmul,
2054                            vta,
2055                            // Always use "undisturbed" value from source register.
2056                            InactiveProcessing::kUndisturbed>(
2057               args.dst, SignedType{args.imm}, /*dst_mask=*/args.src);
2058         }
2059       case Decoder::VOpIViOpcode::kVmvXrv:
2060         // kVmv<nr>rv instruction
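        // The immediate encodes the number of registers to copy minus one, so only the values
        // 0, 1, 3 and 7 (vmv1r.v, vmv2r.v, vmv4r.v and vmv8r.v) are valid.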
2061         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2062           switch (args.imm) {
2063             case 0:
2064               return OpVectorVmvXrv<ElementType, 1>(args.dst, args.src);
2065             case 1:
2066               return OpVectorVmvXrv<ElementType, 2>(args.dst, args.src);
2067             case 3:
2068               return OpVectorVmvXrv<ElementType, 4>(args.dst, args.src);
2069             case 7:
2070               return OpVectorVmvXrv<ElementType, 8>(args.dst, args.src);
2071             default:
2072               return Undefined();
2073           }
2074         } else {
2075           return Undefined();
2076         }
2077       case Decoder::VOpIViOpcode::kVnsrawi:
        // We need to pass the shift value here as a signed type, but the uimm value is always
        // positive and always fits into any integer type.
2080         return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2081             args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2082       case Decoder::VOpIViOpcode::kVnsrlwi:
2083         return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2084             args.dst, args.src, UnsignedType{args.uimm});
2085       case Decoder::VOpIViOpcode::kVslideupvi:
2086         return OpVectorslideup<UnsignedType, vlmul, vta, vma>(
2087             args.dst, args.src, UnsignedType{args.uimm});
2088       case Decoder::VOpIViOpcode::kVslidedownvi:
2089         return OpVectorslidedown<UnsignedType, vlmul, vta, vma>(
2090             args.dst, args.src, UnsignedType{args.uimm});
2091       case Decoder::VOpIViOpcode::kVnclipuwi:
2092         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2093                                 SaturatingUnsignedType,
2094                                 vlmul,
2095                                 vta,
2096                                 vma,
2097                                 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2098       case Decoder::VOpIViOpcode::kVnclipwi:
2099         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2100                                 SaturatingSignedType,
2101                                 vlmul,
2102                                 vta,
2103                                 vma,
2104                                 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2105       case Decoder::VOpIViOpcode::kVssrlvi:
2106         return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2107             args.dst, args.src, UnsignedType{args.uimm});
2108       case Decoder::VOpIViOpcode::kVssravi:
2109         return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2110             args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2111       default:
2112         Undefined();
2113     }
2114   }
2115 
2116   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
  void OpVector(const Decoder::VOpIVvArgs& args) {
2118     using SignedType = berberis::SignedType<ElementType>;
2119     using UnsignedType = berberis::UnsignedType<ElementType>;
2120     using SaturatingSignedType = SaturatingType<SignedType>;
2121     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2122     // Keep cases sorted in opcode order to match RISC-V V manual.
2123     switch (args.opcode) {
2124       case Decoder::VOpIVvOpcode::kVaddvv:
2125         return OpVectorvv<intrinsics::Vaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2126             args.dst, args.src1, args.src2);
2127       case Decoder::VOpIVvOpcode::kVsubvv:
2128         return OpVectorvv<intrinsics::Vsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2129             args.dst, args.src1, args.src2);
2130       case Decoder::VOpIVvOpcode::kVandvv:
2131         return OpVectorvv<intrinsics::Vandvv<ElementType>, ElementType, vlmul, vta, vma>(
2132             args.dst, args.src1, args.src2);
2133       case Decoder::VOpIVvOpcode::kVorvv:
2134         return OpVectorvv<intrinsics::Vorvv<ElementType>, ElementType, vlmul, vta, vma>(
2135             args.dst, args.src1, args.src2);
2136       case Decoder::VOpIVvOpcode::kVxorvv:
2137         return OpVectorvv<intrinsics::Vxorvv<ElementType>, ElementType, vlmul, vta, vma>(
2138             args.dst, args.src1, args.src2);
2139       case Decoder::VOpIVvOpcode::kVrgathervv: {
2140         constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
2141         if (!IsAligned<kRegistersInvolved>(args.src2)) {
2142           return Undefined();
2143         }
2144         constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
2145         alignas(alignof(SIMD128Register)) ElementType indexes[vlmax];
2146         memcpy(indexes, state_->cpu.v + args.src2, sizeof(indexes));
2147         return OpVectorGather<ElementType, vlmul, vta, vma>(
2148             args.dst, args.src1, [&indexes](size_t index) { return indexes[index]; });
2149       }
2150       case Decoder::VOpIVvOpcode::kVmseqvv:
2151         return OpVectorToMaskvv<intrinsics::Vseqvv<ElementType>, ElementType, vlmul, vma>(
2152             args.dst, args.src1, args.src2);
2153       case Decoder::VOpIVvOpcode::kVmsnevv:
2154         return OpVectorToMaskvv<intrinsics::Vsnevv<ElementType>, ElementType, vlmul, vma>(
2155             args.dst, args.src1, args.src2);
2156       case Decoder::VOpIVvOpcode::kVmsltuvv:
2157         return OpVectorToMaskvv<intrinsics::Vsltvv<UnsignedType>, ElementType, vlmul, vma>(
2158             args.dst, args.src1, args.src2);
2159       case Decoder::VOpIVvOpcode::kVmsltvv:
2160         return OpVectorToMaskvv<intrinsics::Vsltvv<SignedType>, ElementType, vlmul, vma>(
2161             args.dst, args.src1, args.src2);
2162       case Decoder::VOpIVvOpcode::kVmsleuvv:
2163         return OpVectorToMaskvv<intrinsics::Vslevv<UnsignedType>, ElementType, vlmul, vma>(
2164             args.dst, args.src1, args.src2);
2165       case Decoder::VOpIVvOpcode::kVmslevv:
2166         return OpVectorToMaskvv<intrinsics::Vslevv<SignedType>, ElementType, vlmul, vma>(
2167             args.dst, args.src1, args.src2);
2168       case Decoder::VOpIVvOpcode::kVsadduvv:
2169         return OpVectorvv<intrinsics::Vaddvv<SaturatingUnsignedType>,
2170                           SaturatingUnsignedType,
2171                           vlmul,
2172                           vta,
2173                           vma>(args.dst, args.src1, args.src2);
2174       case Decoder::VOpIVvOpcode::kVsaddvv:
2175         return OpVectorvv<intrinsics::Vaddvv<SaturatingSignedType>,
2176                           SaturatingSignedType,
2177                           vlmul,
2178                           vta,
2179                           vma>(args.dst, args.src1, args.src2);
2180       case Decoder::VOpIVvOpcode::kVssubuvv:
2181         return OpVectorvv<intrinsics::Vsubvv<SaturatingUnsignedType>,
2182                           SaturatingUnsignedType,
2183                           vlmul,
2184                           vta,
2185                           vma>(args.dst, args.src1, args.src2);
2186       case Decoder::VOpIVvOpcode::kVssubvv:
2187         return OpVectorvv<intrinsics::Vsubvv<SaturatingSignedType>,
2188                           SaturatingSignedType,
2189                           vlmul,
2190                           vta,
2191                           vma>(args.dst, args.src1, args.src2);
2192       case Decoder::VOpIVvOpcode::kVsllvv:
2193         return OpVectorvv<intrinsics::Vslvv<ElementType>, ElementType, vlmul, vta, vma>(
2194             args.dst, args.src1, args.src2);
2195       case Decoder::VOpIVvOpcode::kVsrlvv:
2196         return OpVectorvv<intrinsics::Vsrvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2197             args.dst, args.src1, args.src2);
2198       case Decoder::VOpIVvOpcode::kVsravv:
2199         return OpVectorvv<intrinsics::Vsrvv<SignedType>, ElementType, vlmul, vta, vma>(
2200             args.dst, args.src1, args.src2);
2201       case Decoder::VOpIVvOpcode::kVminuvv:
2202         return OpVectorvv<intrinsics::Vminvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2203             args.dst, args.src1, args.src2);
2204       case Decoder::VOpIVvOpcode::kVminvv:
2205         return OpVectorvv<intrinsics::Vminvv<SignedType>, ElementType, vlmul, vta, vma>(
2206             args.dst, args.src1, args.src2);
2207       case Decoder::VOpIVvOpcode::kVmaxuvv:
2208         return OpVectorvv<intrinsics::Vmaxvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2209             args.dst, args.src1, args.src2);
2210       case Decoder::VOpIVvOpcode::kVmaxvv:
2211         return OpVectorvv<intrinsics::Vmaxvv<SignedType>, ElementType, vlmul, vta, vma>(
2212             args.dst, args.src1, args.src2);
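      // vmerge.vvm and vmv.v.v share an encoding: without a mask (vma == NoInactiveProcessing) the
      // instruction is vmv.v.v, which requires the vs2 field (src1 here) to be zero; with a mask it
      // is vmerge, implemented as a masked copy whose "undisturbed" inactive elements presumably
      // come from the src1 group passed as dst_mask rather than from the old destination.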
2213       case Decoder::VOpIVvOpcode::kVmergevv:
2214         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2215           if (args.src1 != 0) {
2216             return Undefined();
2217           }
2218           return OpVectorv<intrinsics::Vcopyv<ElementType>, ElementType, vlmul, vta, vma>(
2219               args.dst, args.src2);
2220         } else {
2221           return OpVectorv<intrinsics::Vcopyv<ElementType>,
2222                            ElementType,
2223                            vlmul,
2224                            vta,
2225                            // Always use "undisturbed" value from source register.
2226                            InactiveProcessing::kUndisturbed>(
2227               args.dst, args.src2, /*dst_mask=*/args.src1);
2228         }
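      // Narrowing shifts ("wv" form) read 2*SEW-wide elements from a double-width source register
      // group and produce SEW-wide results.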
2229       case Decoder::VOpIVvOpcode::kVnsrawv:
2230         return OpVectorNarrowwv<intrinsics::Vnsrwv<SignedType>, SignedType, vlmul, vta, vma>(
2231             args.dst, args.src1, args.src2);
2232       case Decoder::VOpIVvOpcode::kVnsrlwv:
2233         return OpVectorNarrowwv<intrinsics::Vnsrwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2234             args.dst, args.src1, args.src2);
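      // Fixed-point opcodes below additionally pass kVxrm so the intrinsic can apply the rounding
      // mode from the vxrm CSR while saturating the result.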
2235       case Decoder::VOpIVvOpcode::kVsmulvv:
2236         return OpVectorvv<intrinsics::Vsmulvv<SaturatingSignedType>,
2237                           ElementType,
2238                           vlmul,
2239                           vta,
2240                           vma,
2241                           kVxrm>(args.dst, args.src1, args.src2);
2242       case Decoder::VOpIVvOpcode::kVssrlvv:
2243         return OpVectorvv<intrinsics::Vssrvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2244             args.dst, args.src1, args.src2);
2245       case Decoder::VOpIVvOpcode::kVssravv:
2246         return OpVectorvv<intrinsics::Vssrvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2247             args.dst, args.src1, args.src2);
2248       case Decoder::VOpIVvOpcode::kVnclipuwv:
2249         return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingUnsignedType>,
2250                                 SaturatingUnsignedType,
2251                                 vlmul,
2252                                 vta,
2253                                 vma,
2254                                 kVxrm>(args.dst, args.src1, args.src2);
2255       case Decoder::VOpIVvOpcode::kVnclipwv:
2256         return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingSignedType>,
2257                                 SaturatingSignedType,
2258                                 vlmul,
2259                                 vta,
2260                                 vma,
2261                                 kVxrm>(args.dst, args.src1, args.src2);
2262       default:
2263         Undefined();
2264     }
2265   }
2266 
2267   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2268   void OpVector(const Decoder::VOpIVxArgs& args, Register arg2) {
2269     using SignedType = berberis::SignedType<ElementType>;
2270     using UnsignedType = berberis::UnsignedType<ElementType>;
2271     using SaturatingSignedType = SaturatingType<SignedType>;
2272     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
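    // The scalar rs1 operand arrives as a full 64-bit Register; MaybeTruncateTo narrows it to the
    // element width (SEW) expected by each intrinsic.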
2273     // Keep cases sorted in opcode order to match RISC-V V manual.
2274     switch (args.opcode) {
2275       case Decoder::VOpIVxOpcode::kVaddvx:
2276         return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2277             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2278       case Decoder::VOpIVxOpcode::kVsubvx:
2279         return OpVectorvx<intrinsics::Vsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2280             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2281       case Decoder::VOpIVxOpcode::kVrsubvx:
2282         return OpVectorvx<intrinsics::Vrsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2283             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2284       case Decoder::VOpIVxOpcode::kVandvx:
2285         return OpVectorvx<intrinsics::Vandvx<ElementType>, ElementType, vlmul, vta, vma>(
2286             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2287       case Decoder::VOpIVxOpcode::kVorvx:
2288         return OpVectorvx<intrinsics::Vorvx<ElementType>, ElementType, vlmul, vta, vma>(
2289             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2290       case Decoder::VOpIVxOpcode::kVxorvx:
2291         return OpVectorvx<intrinsics::Vxorvx<ElementType>, ElementType, vlmul, vta, vma>(
2292             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2293       case Decoder::VOpIVxOpcode::kVrgathervx:
2294         return OpVectorGather<ElementType, vlmul, vta, vma>(
2295             args.dst, args.src1, [&arg2](size_t /*index*/) {
2296               return MaybeTruncateTo<ElementType>(arg2);
2297             });
2298       case Decoder::VOpIVxOpcode::kVmseqvx:
2299         return OpVectorToMaskvx<intrinsics::Vseqvx<ElementType>, ElementType, vlmul, vma>(
2300             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2301       case Decoder::VOpIVxOpcode::kVmsnevx:
2302         return OpVectorToMaskvx<intrinsics::Vsnevx<ElementType>, ElementType, vlmul, vma>(
2303             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2304       case Decoder::VOpIVxOpcode::kVmsltuvx:
2305         return OpVectorToMaskvx<intrinsics::Vsltvx<UnsignedType>, UnsignedType, vlmul, vma>(
2306             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2307       case Decoder::VOpIVxOpcode::kVmsltvx:
2308         return OpVectorToMaskvx<intrinsics::Vsltvx<SignedType>, SignedType, vlmul, vma>(
2309             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2310       case Decoder::VOpIVxOpcode::kVmsleuvx:
2311         return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2312             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2313       case Decoder::VOpIVxOpcode::kVmslevx:
2314         return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2315             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2316       case Decoder::VOpIVxOpcode::kVmsgtuvx:
2317         return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2318             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2319       case Decoder::VOpIVxOpcode::kVmsgtvx:
2320         return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2321             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2322       case Decoder::VOpIVxOpcode::kVsadduvx:
2323         return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2324                           SaturatingUnsignedType,
2325                           vlmul,
2326                           vta,
2327                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2328       case Decoder::VOpIVxOpcode::kVsaddvx:
2329         return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2330                           SaturatingSignedType,
2331                           vlmul,
2332                           vta,
2333                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2334       case Decoder::VOpIVxOpcode::kVssubuvx:
2335         return OpVectorvx<intrinsics::Vsubvx<SaturatingUnsignedType>,
2336                           SaturatingUnsignedType,
2337                           vlmul,
2338                           vta,
2339                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2340       case Decoder::VOpIVxOpcode::kVssubvx:
2341         return OpVectorvx<intrinsics::Vsubvx<SaturatingSignedType>,
2342                           SaturatingSignedType,
2343                           vlmul,
2344                           vta,
2345                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2346       case Decoder::VOpIVxOpcode::kVsllvx:
2347         return OpVectorvx<intrinsics::Vslvx<ElementType>, ElementType, vlmul, vta, vma>(
2348             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2349       case Decoder::VOpIVxOpcode::kVsrlvx:
2350         return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2351             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2352       case Decoder::VOpIVxOpcode::kVsravx:
2353         return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2354             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2355       case Decoder::VOpIVxOpcode::kVminuvx:
2356         return OpVectorvx<intrinsics::Vminvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2357             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2358       case Decoder::VOpIVxOpcode::kVminvx:
2359         return OpVectorvx<intrinsics::Vminvx<SignedType>, SignedType, vlmul, vta, vma>(
2360             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2361       case Decoder::VOpIVxOpcode::kVmaxuvx:
2362         return OpVectorvx<intrinsics::Vmaxvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2363             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2364       case Decoder::VOpIVxOpcode::kVmaxvx:
2365         return OpVectorvx<intrinsics::Vmaxvx<SignedType>, SignedType, vlmul, vta, vma>(
2366             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2367       case Decoder::VOpIVxOpcode::kVmergevx:
2368         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2369           if (args.src1 != 0) {
2370             return Undefined();
2371           }
2372           return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(
2373               args.dst, MaybeTruncateTo<ElementType>(arg2));
2374         } else {
2375           return OpVectorx<intrinsics::Vcopyx<ElementType>,
2376                            ElementType,
2377                            vlmul,
2378                            vta,
2379                            // Always use "undisturbed" value from source register.
2380                            InactiveProcessing::kUndisturbed>(
2381               args.dst, MaybeTruncateTo<ElementType>(arg2), /*dst_mask=*/args.src1);
2382         }
2383       case Decoder::VOpIVxOpcode::kVnsrawx:
2384         return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2385             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2386       case Decoder::VOpIVxOpcode::kVnsrlwx:
2387         return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2388             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2389       case Decoder::VOpIVxOpcode::kVslideupvx:
2390         return OpVectorslideup<ElementType, vlmul, vta, vma>(
2391             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2392       case Decoder::VOpIVxOpcode::kVslidedownvx:
2393         return OpVectorslidedown<ElementType, vlmul, vta, vma>(
2394             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2395       case Decoder::VOpIVxOpcode::kVsmulvx:
2396         return OpVectorvx<intrinsics::Vsmulvx<SaturatingSignedType>,
2397                           SaturatingSignedType,
2398                           vlmul,
2399                           vta,
2400                           vma,
2401                           kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2402       case Decoder::VOpIVxOpcode::kVssrlvx:
2403         return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2404             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2405       case Decoder::VOpIVxOpcode::kVssravx:
2406         return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2407             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2408       case Decoder::VOpIVxOpcode::kVnclipuwx:
2409         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2410                                 SaturatingUnsignedType,
2411                                 vlmul,
2412                                 vta,
2413                                 vma,
2414                                 kVxrm>(args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2415       case Decoder::VOpIVxOpcode::kVnclipwx:
2416         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2417                                 SaturatingSignedType,
2418                                 vlmul,
2419                                 vta,
2420                                 vma,
2421                                 kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2422       default:
2423         Undefined();
2424     }
2425   }
2426 
2427   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2428   void OpVector(const Decoder::VOpMVvArgs& args) {
2429     using SignedType = berberis::SignedType<ElementType>;
2430     using UnsignedType = berberis::UnsignedType<ElementType>;
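    // Mask-register logical instructions (vmand.mm, vmor.mm, ...) are always unmasked, so they are
    // only recognized here when vma is NoInactiveProcessing; all other opcodes fall through to the
    // switch below.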
2431     if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2432       // Keep cases sorted in opcode order to match RISC-V V manual.
2433       switch (args.opcode) {
2434         case Decoder::VOpMVvOpcode::kVmandnmm:
2435           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & ~rhs; }>(
2436               args.dst, args.src1, args.src2);
2437         case Decoder::VOpMVvOpcode::kVmandmm:
2438           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & rhs; }>(
2439               args.dst, args.src1, args.src2);
2440         case Decoder::VOpMVvOpcode::kVmormm:
2441           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | rhs; }>(
2442               args.dst, args.src1, args.src2);
2443         case Decoder::VOpMVvOpcode::kVmxormm:
2444           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs ^ rhs; }>(
2445               args.dst, args.src1, args.src2);
2446         case Decoder::VOpMVvOpcode::kVmornmm:
2447           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | ~rhs; }>(
2448               args.dst, args.src1, args.src2);
2449         case Decoder::VOpMVvOpcode::kVmnandmm:
2450           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs & rhs); }>(
2451               args.dst, args.src1, args.src2);
2452         case Decoder::VOpMVvOpcode::kVmnormm:
2453           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs | rhs); }>(
2454               args.dst, args.src1, args.src2);
2455         case Decoder::VOpMVvOpcode::kVmxnormm:
2456           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs ^ rhs); }>(
2457               args.dst, args.src1, args.src2);
2458         default:;  // Do nothing: handled in next switch.
2459       }
2460     }
2461     // Keep cases sorted in opcode order to match RISC-V V manual.
2462     switch (args.opcode) {
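      // For the reductions the value wrapped in Vec<...> appears to be the operation's identity
      // element (0 for sum/or/xor and unsigned max, all-ones for and, the type's maximum for the
      // min variants, the minimum for signed max), used in place of elements that don't
      // participate.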
2463       case Decoder::VOpMVvOpcode::kVredsumvs:
2464         return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
2465             args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2466       case Decoder::VOpMVvOpcode::kVredandvs:
2467         return OpVectorvs<intrinsics::Vredandvs<ElementType>, ElementType, vlmul, vta, vma>(
2468             args.dst, Vec<~ElementType{}>{args.src1}, args.src2);
2469       case Decoder::VOpMVvOpcode::kVredorvs:
2470         return OpVectorvs<intrinsics::Vredorvs<ElementType>, ElementType, vlmul, vta, vma>(
2471             args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2472       case Decoder::VOpMVvOpcode::kVredxorvs:
2473         return OpVectorvs<intrinsics::Vredxorvs<ElementType>, ElementType, vlmul, vta, vma>(
2474             args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2475       case Decoder::VOpMVvOpcode::kVredminuvs:
2476         return OpVectorvs<intrinsics::Vredminvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2477             args.dst,
2478             Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>{
2479                 args.src1},
2480             args.src2);
2481       case Decoder::VOpMVvOpcode::kVredminvs:
2482         return OpVectorvs<intrinsics::Vredminvs<SignedType>, SignedType, vlmul, vta, vma>(
2483             args.dst,
2484             Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src1},
2485             args.src2);
2486       case Decoder::VOpMVvOpcode::kVredmaxuvs:
2487         return OpVectorvs<intrinsics::Vredmaxvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2488             args.dst, Vec<UnsignedType{}>{args.src1}, args.src2);
2489       case Decoder::VOpMVvOpcode::kVredmaxvs:
2490         return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>(
2491             args.dst,
2492             Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src1},
2493             args.src2);
2494       case Decoder::VOpMVvOpcode::kVaadduvv:
2495         return OpVectorvv<intrinsics::Vaaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2496             args.dst, args.src1, args.src2);
2497       case Decoder::VOpMVvOpcode::kVaaddvv:
2498         return OpVectorvv<intrinsics::Vaaddvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2499             args.dst, args.src1, args.src2);
2500       case Decoder::VOpMVvOpcode::kVasubuvv:
2501         return OpVectorvv<intrinsics::Vasubvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2502             args.dst, args.src1, args.src2);
2503       case Decoder::VOpMVvOpcode::kVasubvv:
2504         return OpVectorvv<intrinsics::Vasubvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2505             args.dst, args.src1, args.src2);
2506       case Decoder::VOpMVvOpcode::kVWXUnary0:
2507         switch (args.vwxunary0_opcode) {
2508           case Decoder::VWXUnary0Opcode::kVmvxs:
2509             if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2510               return Undefined();
2511             }
2512             return OpVectorVmvxs<SignedType>(args.dst, args.src1);
2513           case Decoder::VWXUnary0Opcode::kVcpopm:
2514             return OpVectorVWXUnary0<intrinsics::Vcpopm<>, vma>(args.dst, args.src1);
2515           case Decoder::VWXUnary0Opcode::kVfirstm:
2516             return OpVectorVWXUnary0<intrinsics::Vfirstm<>, vma>(args.dst, args.src1);
2517           default:
2518             return Undefined();
2519         }
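      // VXUNARY0 encodes the integer extension instructions vzext.vf2/4/8 and vsext.vf2/4/8. The
      // constexpr size checks below reject combinations where the source element width (SEW/2,
      // SEW/4 or SEW/8) would drop below 8 bits; such encodings are reserved.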
2520       case Decoder::VOpMVvOpcode::kVXUnary0:
2521         switch (args.vxunary0_opcode) {
2522           case Decoder::VXUnary0Opcode::kVzextvf2m:
2523             if constexpr (sizeof(UnsignedType) >= 2) {
2524               return OpVectorVXUnary0<intrinsics::Vextf2<UnsignedType>,
2525                                       UnsignedType,
2526                                       2,
2527                                       vlmul,
2528                                       vta,
2529                                       vma>(args.dst, args.src1);
2530             }
2531             break;
2532           case Decoder::VXUnary0Opcode::kVsextvf2m:
2533             if constexpr (sizeof(SignedType) >= 2) {
2534               return OpVectorVXUnary0<intrinsics::Vextf2<SignedType>,
2535                                       SignedType,
2536                                       2,
2537                                       vlmul,
2538                                       vta,
2539                                       vma>(args.dst, args.src1);
2540             }
2541             break;
2542           case Decoder::VXUnary0Opcode::kVzextvf4m:
2543             if constexpr (sizeof(UnsignedType) >= 4) {
2544               return OpVectorVXUnary0<intrinsics::Vextf4<UnsignedType>,
2545                                       UnsignedType,
2546                                       4,
2547                                       vlmul,
2548                                       vta,
2549                                       vma>(args.dst, args.src1);
2550             }
2551             break;
2552           case Decoder::VXUnary0Opcode::kVsextvf4m:
2553             if constexpr (sizeof(SignedType) >= 4) {
2554               return OpVectorVXUnary0<intrinsics::Vextf4<SignedType>,
2555                                       SignedType,
2556                                       4,
2557                                       vlmul,
2558                                       vta,
2559                                       vma>(args.dst, args.src1);
2560             }
2561             break;
2562           case Decoder::VXUnary0Opcode::kVzextvf8m:
2563             if constexpr (sizeof(UnsignedType) >= 8) {
2564               return OpVectorVXUnary0<intrinsics::Vextf8<UnsignedType>,
2565                                       UnsignedType,
2566                                       8,
2567                                       vlmul,
2568                                       vta,
2569                                       vma>(args.dst, args.src1);
2570             }
2571             break;
2572           case Decoder::VXUnary0Opcode::kVsextvf8m:
2573             if constexpr (sizeof(SignedType) >= 8) {
2574               return OpVectorVXUnary0<intrinsics::Vextf8<SignedType>,
2575                                       SignedType,
2576                                       8,
2577                                       vlmul,
2578                                       vta,
2579                                       vma>(args.dst, args.src1);
2580             }
2581             break;
2582           default:
2583             return Undefined();
2584         }
2585         return Undefined();
2586       case Decoder::VOpMVvOpcode::kVMUnary0:
2587         switch (args.vmunary0_opcode) {
2588           case Decoder::VMUnary0Opcode::kVmsbfm:
2589             return OpVectorVMUnary0<intrinsics::Vmsbfm<>, vma>(args.dst, args.src1);
2590           case Decoder::VMUnary0Opcode::kVmsofm:
2591             return OpVectorVMUnary0<intrinsics::Vmsofm<>, vma>(args.dst, args.src1);
2592           case Decoder::VMUnary0Opcode::kVmsifm:
2593             return OpVectorVMUnary0<intrinsics::Vmsifm<>, vma>(args.dst, args.src1);
2594           case Decoder::VMUnary0Opcode::kViotam:
2595             return OpVectorViotam<ElementType, vlmul, vta, vma>(args.dst, args.src1);
2596           case Decoder::VMUnary0Opcode::kVidv:
2597             if (args.src1) {
2598               return Undefined();
2599             }
2600             return OpVectorVidv<ElementType, vlmul, vta, vma>(args.dst);
2601           default:
2602             return Undefined();
2603         }
2604       case Decoder::VOpMVvOpcode::kVdivuvv:
2605         return OpVectorvv<intrinsics::Vdivvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2606             args.dst, args.src1, args.src2);
2607       case Decoder::VOpMVvOpcode::kVdivvv:
2608         return OpVectorvv<intrinsics::Vdivvv<SignedType>, SignedType, vlmul, vta, vma>(
2609             args.dst, args.src1, args.src2);
2610       case Decoder::VOpMVvOpcode::kVremuvv:
2611         return OpVectorvv<intrinsics::Vremvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2612             args.dst, args.src1, args.src2);
2613       case Decoder::VOpMVvOpcode::kVremvv:
2614         return OpVectorvv<intrinsics::Vremvv<SignedType>, SignedType, vlmul, vta, vma>(
2615             args.dst, args.src1, args.src2);
2616       case Decoder::VOpMVvOpcode::kVmulhuvv:
2617         return OpVectorvv<intrinsics::Vmulhvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2618             args.dst, args.src1, args.src2);
2619       case Decoder::VOpMVvOpcode::kVmulvv:
2620         return OpVectorvv<intrinsics::Vmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2621             args.dst, args.src1, args.src2);
2622       case Decoder::VOpMVvOpcode::kVmulhsuvv:
2623         return OpVectorvv<intrinsics::Vmulhsuvv<SignedType>, SignedType, vlmul, vta, vma>(
2624             args.dst, args.src1, args.src2);
2625       case Decoder::VOpMVvOpcode::kVmulhvv:
2626         return OpVectorvv<intrinsics::Vmulhvv<SignedType>, SignedType, vlmul, vta, vma>(
2627             args.dst, args.src1, args.src2);
2628       case Decoder::VOpMVvOpcode::kVmaddvv:
2629         return OpVectorvvv<intrinsics::Vmaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2630             args.dst, args.src1, args.src2);
2631       case Decoder::VOpMVvOpcode::kVnmsubvv:
2632         return OpVectorvvv<intrinsics::Vnmsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2633             args.dst, args.src1, args.src2);
2634       case Decoder::VOpMVvOpcode::kVmaccvv:
2635         return OpVectorvvv<intrinsics::Vmaccvv<ElementType>, ElementType, vlmul, vta, vma>(
2636             args.dst, args.src1, args.src2);
2637       case Decoder::VOpMVvOpcode::kVnmsacvv:
2638         return OpVectorvvv<intrinsics::Vnmsacvv<ElementType>, ElementType, vlmul, vta, vma>(
2639             args.dst, args.src1, args.src2);
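      // Widening ops produce 2*SEW-wide results, so the destination occupies twice as many
      // registers as each source group; the OpVectorWiden* helpers handle that bookkeeping.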
2640       case Decoder::VOpMVvOpcode::kVwadduvv:
2641         return OpVectorWidenvv<intrinsics::Vwaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2642             args.dst, args.src1, args.src2);
2643       case Decoder::VOpMVvOpcode::kVwaddvv:
2644         return OpVectorWidenvv<intrinsics::Vwaddvv<SignedType>, SignedType, vlmul, vta, vma>(
2645             args.dst, args.src1, args.src2);
2646       case Decoder::VOpMVvOpcode::kVwsubuvv:
2647         return OpVectorWidenvv<intrinsics::Vwsubvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2648             args.dst, args.src1, args.src2);
2649       case Decoder::VOpMVvOpcode::kVwsubvv:
2650         return OpVectorWidenvv<intrinsics::Vwsubvv<SignedType>, SignedType, vlmul, vta, vma>(
2651             args.dst, args.src1, args.src2);
2652       case Decoder::VOpMVvOpcode::kVwadduwv:
2653         return OpVectorWidenwv<intrinsics::Vwaddwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2654             args.dst, args.src1, args.src2);
2655       case Decoder::VOpMVvOpcode::kVwaddwv:
2656         return OpVectorWidenwv<intrinsics::Vwaddwv<SignedType>, SignedType, vlmul, vta, vma>(
2657             args.dst, args.src1, args.src2);
2658       case Decoder::VOpMVvOpcode::kVwsubuwv:
2659         return OpVectorWidenwv<intrinsics::Vwsubwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2660             args.dst, args.src1, args.src2);
2661       case Decoder::VOpMVvOpcode::kVwsubwv:
2662         return OpVectorWidenwv<intrinsics::Vwsubwv<SignedType>, SignedType, vlmul, vta, vma>(
2663             args.dst, args.src1, args.src2);
2664       case Decoder::VOpMVvOpcode::kVwmuluvv:
2665         return OpVectorWidenvv<intrinsics::Vwmulvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2666             args.dst, args.src1, args.src2);
2667       case Decoder::VOpMVvOpcode::kVwmulsuvv:
2668         return OpVectorWidenvv<intrinsics::Vwmulsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2669             args.dst, args.src1, args.src2);
2670       case Decoder::VOpMVvOpcode::kVwmulvv:
2671         return OpVectorWidenvv<intrinsics::Vwmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2672             args.dst, args.src1, args.src2);
2673       case Decoder::VOpMVvOpcode::kVwmaccuvv:
2674         return OpVectorWidenvvw<intrinsics::Vwmaccvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2675             args.dst, args.src1, args.src2);
2676       case Decoder::VOpMVvOpcode::kVwmaccvv:
2677         return OpVectorWidenvvw<intrinsics::Vwmaccvv<SignedType>, SignedType, vlmul, vta, vma>(
2678             args.dst, args.src1, args.src2);
2679       case Decoder::VOpMVvOpcode::kVwmaccsuvv:
2680         return OpVectorWidenvvw<intrinsics::Vwmaccsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2681             args.dst, args.src1, args.src2);
2682       default:
2683         Undefined();
2684     }
2685   }
2686 
2687   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2688   void OpVector(const Decoder::VOpMVxArgs& args, Register arg2) {
2689     using SignedType = berberis::SignedType<ElementType>;
2690     using UnsignedType = berberis::UnsignedType<ElementType>;
2691     // Keep cases sorted in opcode order to match RISC-V V manual.
2692     switch (args.opcode) {
2693       case Decoder::VOpMVxOpcode::kVaadduvx:
2694         return OpVectorvx<intrinsics::Vaaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2695             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2696       case Decoder::VOpMVxOpcode::kVaaddvx:
2697         return OpVectorvx<intrinsics::Vaaddvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2698             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2699       case Decoder::VOpMVxOpcode::kVasubuvx:
2700         return OpVectorvx<intrinsics::Vasubvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2701             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2702       case Decoder::VOpMVxOpcode::kVasubvx:
2703         return OpVectorvx<intrinsics::Vasubvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2704             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2705       case Decoder::VOpMVxOpcode::kVslide1upvx:
2706         return OpVectorslide1up<SignedType, vlmul, vta, vma>(
2707             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2708       case Decoder::VOpMVxOpcode::kVslide1downvx:
2709         return OpVectorslide1down<SignedType, vlmul, vta, vma>(
2710             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2711       case Decoder::VOpMVxOpcode::kVRXUnary0:
2712         switch (args.vrxunary0_opcode) {
2713           case Decoder::VRXUnary0Opcode::kVmvsx:
2714             if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2715               return Undefined();
2716             }
2717             return OpVectorVmvsx<SignedType, vta>(args.dst, MaybeTruncateTo<SignedType>(arg2));
2718           default:
2719             return Undefined();
2720         }
2721       case Decoder::VOpMVxOpcode::kVmulhuvx:
2722         return OpVectorvx<intrinsics::Vmulhvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2723             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2724       case Decoder::VOpMVxOpcode::kVmulvx:
2725         return OpVectorvx<intrinsics::Vmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2726             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2727       case Decoder::VOpMVxOpcode::kVdivuvx:
2728         return OpVectorvx<intrinsics::Vdivvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2729             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2730       case Decoder::VOpMVxOpcode::kVdivvx:
2731         return OpVectorvx<intrinsics::Vdivvx<SignedType>, SignedType, vlmul, vta, vma>(
2732             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2733       case Decoder::VOpMVxOpcode::kVremuvx:
2734         return OpVectorvx<intrinsics::Vremvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2735             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2736       case Decoder::VOpMVxOpcode::kVremvx:
2737         return OpVectorvx<intrinsics::Vremvx<SignedType>, SignedType, vlmul, vta, vma>(
2738             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2739       case Decoder::VOpMVxOpcode::kVmulhsuvx:
2740         return OpVectorvx<intrinsics::Vmulhsuvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2741             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2742       case Decoder::VOpMVxOpcode::kVmulhvx:
2743         return OpVectorvx<intrinsics::Vmulhvx<SignedType>, SignedType, vlmul, vta, vma>(
2744             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2745       case Decoder::VOpMVxOpcode::kVmaddvx:
2746         return OpVectorvxv<intrinsics::Vmaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2747             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2748       case Decoder::VOpMVxOpcode::kVnmsubvx:
2749         return OpVectorvxv<intrinsics::Vnmsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2750             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2751       case Decoder::VOpMVxOpcode::kVmaccvx:
2752         return OpVectorvxv<intrinsics::Vmaccvx<ElementType>, ElementType, vlmul, vta, vma>(
2753             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2754       case Decoder::VOpMVxOpcode::kVnmsacvx:
2755         return OpVectorvxv<intrinsics::Vnmsacvx<ElementType>, ElementType, vlmul, vta, vma>(
2756             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2757       case Decoder::VOpMVxOpcode::kVwadduvx:
2758         return OpVectorWidenvx<intrinsics::Vwaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2759             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2760       case Decoder::VOpMVxOpcode::kVwaddvx:
2761         return OpVectorWidenvx<intrinsics::Vwaddvx<SignedType>, SignedType, vlmul, vta, vma>(
2762             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2763       case Decoder::VOpMVxOpcode::kVwsubuvx:
2764         return OpVectorWidenvx<intrinsics::Vwsubvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2765             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2766       case Decoder::VOpMVxOpcode::kVwsubvx:
2767         return OpVectorWidenvx<intrinsics::Vwsubvx<SignedType>, SignedType, vlmul, vta, vma>(
2768             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2769       case Decoder::VOpMVxOpcode::kVwadduwx:
2770         return OpVectorWidenwx<intrinsics::Vwaddwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2771             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2772       case Decoder::VOpMVxOpcode::kVwaddwx:
2773         return OpVectorWidenwx<intrinsics::Vwaddwx<SignedType>, SignedType, vlmul, vta, vma>(
2774             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2775       case Decoder::VOpMVxOpcode::kVwsubuwx:
2776         return OpVectorWidenwx<intrinsics::Vwsubwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2777             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2778       case Decoder::VOpMVxOpcode::kVwsubwx:
2779         return OpVectorWidenwx<intrinsics::Vwsubwx<SignedType>, SignedType, vlmul, vta, vma>(
2780             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2781       case Decoder::VOpMVxOpcode::kVwmuluvx:
2782         return OpVectorWidenvx<intrinsics::Vwmulvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2783             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2784       case Decoder::VOpMVxOpcode::kVwmulsuvx:
2785         return OpVectorWidenvx<intrinsics::Vwmulsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2786             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2787       case Decoder::VOpMVxOpcode::kVwmulvx:
2788         return OpVectorWidenvx<intrinsics::Vwmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2789             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2790       case Decoder::VOpMVxOpcode::kVwmaccuvx:
2791         return OpVectorWidenvxw<intrinsics::Vwmaccvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2792             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2793       case Decoder::VOpMVxOpcode::kVwmaccvx:
2794         return OpVectorWidenvxw<intrinsics::Vwmaccvx<SignedType>, SignedType, vlmul, vta, vma>(
2795             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2796       case Decoder::VOpMVxOpcode::kVwmaccusvx:
2797         return OpVectorWidenvxw<intrinsics::Vwmaccusvx<ElementType>, ElementType, vlmul, vta, vma>(
2798             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2799       case Decoder::VOpMVxOpcode::kVwmaccsuvx:
2800         return OpVectorWidenvxw<intrinsics::Vwmaccsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2801             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2802       default:
2803         Undefined();
2804     }
2805   }
2806 
2807   template <typename DataElementType,
2808             VectorRegisterGroupMultiplier vlmul,
2809             typename IndexElementType,
2810             size_t kSegmentSize,
2811             size_t kIndexRegistersInvolved,
2812             TailProcessing vta,
2813             auto vma>
2814   void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2815     return OpVector<DataElementType,
2816                     kSegmentSize,
2817                     NumberOfRegistersInvolved(vlmul),
2818                     IndexElementType,
2819                     kIndexRegistersInvolved,
2820                     !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(args, src);
2821   }
2822 
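  // Indexed stores snapshot the index register group into a flat local buffer and address it by
  // element number from the offset lambda; the data registers themselves are read inside
  // OpVectorStore.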
2823   template <typename DataElementType,
2824             size_t kSegmentSize,
2825             size_t kNumRegistersInGroup,
2826             typename IndexElementType,
2827             size_t kIndexRegistersInvolved,
2828             bool kUseMasking>
2829   void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2830     if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
2831       return Undefined();
2832     }
2833     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
2834     alignas(alignof(SIMD128Register))
2835         IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
2836     memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
2837     return OpVectorStore<DataElementType, kSegmentSize, kNumRegistersInGroup, kUseMasking>(
2838         args.data, src, [&indexes](size_t index) { return indexes[index]; });
2839   }
2840 
2841   template <typename ElementType,
2842             size_t kSegmentSize,
2843             VectorRegisterGroupMultiplier vlmul,
2844             TailProcessing vta,
2845             auto vma>
2846   void OpVector(const Decoder::VStoreStrideArgs& args, Register src, Register stride) {
2847     return OpVectorStore<ElementType,
2848                          kSegmentSize,
2849                          NumberOfRegistersInvolved(vlmul),
2850                          !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(
2851         args.data, src, [stride](size_t index) { return stride * index; });
2852   }
2853 
2854   template <typename ElementType,
2855             size_t kSegmentSize,
2856             VectorRegisterGroupMultiplier vlmul,
2857             TailProcessing vta,
2858             auto vma>
2859   void OpVector(const Decoder::VStoreUnitStrideArgs& args, Register src) {
2860     switch (args.opcode) {
2861       case Decoder::VSUmOpOpcode::kVseXX:
2862         return OpVectorStore<ElementType,
2863                              kSegmentSize,
2864                              NumberOfRegistersInvolved(vlmul),
2865                              !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
2866                              Decoder::VSUmOpOpcode::kVseXX>(args.data, src, [](size_t index) {
2867           return kSegmentSize * sizeof(ElementType) * index;
2868         });
2869       case Decoder::VSUmOpOpcode::kVsm:
2870         if constexpr (kSegmentSize == 1 &&
2871                       std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2872           return OpVectorStore<UInt8,
2873                                1,
2874                                1,
2875                                /*kUseMasking=*/false,
2876                                Decoder::VSUmOpOpcode::kVsm>(
2877               args.data, src, [](size_t index) { return index; });
2878         }
2879         return Undefined();
2880       default:
2881         return Undefined();
2882     }
2883   }
2884 
2885   // See VLoadStrideArgs for an explanation of the semantics: VStoreStrideArgs is almost symmetric,
2886   // except that it ignores the vta and vma modes and never alters inactive elements in memory.
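  // Rough example (assuming the usual decoder mapping): vsse32.v v8, (a0), a1 with LMUL=2 reaches
  // OpVectorStore with ElementType=UInt32, kSegmentSize=1 and two registers per group, and the
  // stride lambda returns stride * index, so element i of the v8/v9 group is written to a0 + a1*i.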
2887   template <typename ElementType,
2888             size_t kSegmentSize,
2889             size_t kNumRegistersInGroup,
2890             bool kUseMasking,
2891             typename Decoder::VSUmOpOpcode opcode = typename Decoder::VSUmOpOpcode{},
2892             typename GetElementOffsetLambdaType>
2893   void OpVectorStore(uint8_t data, Register src, GetElementOffsetLambdaType GetElementOffset) {
2894     using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
2895     if (!IsAligned<kNumRegistersInGroup>(data)) {
2896       return Undefined();
2897     }
2898     if (data + kNumRegistersInGroup * kSegmentSize > 32) {
2899       return Undefined();
2900     }
2901     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
2902     size_t vstart = GetCsr<CsrName::kVstart>();
2903     size_t vl = GetCsr<CsrName::kVl>();
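    // vsm.v stores mask bits: convert vl from a bit count into the number of whole bytes to write,
    // e.g. vl == 17 becomes AlignUp<CHAR_BIT>(17) / CHAR_BIT == 3 bytes.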
2904     if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) {
2905       vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
2906     }
2907     // In case of a memory access fault we may set vstart to a non-zero value; set it to zero here
2908     // to simplify the logic below.
2909     SetCsr<CsrName::kVstart>(0);
2910     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
2911     // vector register group, including that no tail elements are updated with agnostic values.
2912     if (vstart >= vl) [[unlikely]] {
2913       // Technically, since stores never touch tail elements this is not needed, but it makes it
2914       // easier to reason about the rest of the function.
2915       return;
2916     }
2917     char* ptr = ToHostAddr<char>(src);
2918     // Note: within_group_id is the current register id within a register group. During one
2919     // iteration of this loop we store results for all registers with the current id in all
2920     // groups. E.g. with three-field segments in two-register groups starting at v0 we'd store data
2921     // from v0, v2, v4 during the first iteration (id within group = 0), and v1, v3, v5 during the
2922     // second iteration (id within group = 1). This ensures that memory is always accessed in ordered fashion.
2923     auto mask = GetMaskForVectorOperationsIfNeeded<kUseMasking>();
2924     for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
2925          ++within_group_id) {
2926       // No need to continue if we no longer have elements to store.
2927       if (within_group_id * kElementsCount >= vl) {
2928         break;
2929       }
2930       auto register_mask =
2931           std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
2932       // Store elements to memory, but only if there are any active ones.
2933       for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
2934            ++within_register_id) {
2935         size_t element_index = kElementsCount * within_group_id + within_register_id;
2936         // Stop if we reached the vl limit.
2937         if (vl <= element_index) {
2938           break;
2939         }
2940         // Don't touch masked-out elements.
2941         if constexpr (kUseMasking) {
2942           if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
2943                                              1 << within_register_id)}) == MaskType{0}) {
2944             continue;
2945           }
2946         }
2947         // Store segment to memory.
2948         for (size_t field = 0; field < kSegmentSize; ++field) {
2949           bool exception_raised = FaultyStore(
2950               ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
2951               sizeof(ElementType),
2952               SIMD128Register{state_->cpu.v[data + within_group_id + field * kNumRegistersInGroup]}
2953                   .Get<ElementType>(within_register_id));
2954           // Stop processing if memory is inaccessible. It's also the only case where we have to set
2955           // vstart to a non-zero value!
2956           if (exception_raised) {
2957             SetCsr<CsrName::kVstart>(element_index);
2958             return;
2959           }
2960         }
2961       }
2962       // Next group should be fully processed.
2963       vstart = 0;
2964     }
2965   }
2966 
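  // viota.m writes into each destination element the number of active mask bits of the source that
  // are below that element's position (an exclusive prefix sum). E.g. source mask bits 1,0,0,1
  // produce 0,1,1,1 for the first four elements.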
2967   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2968   void OpVectorViotam(uint8_t dst, uint8_t src1) {
2969     return OpVectorViotam<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src1);
2970   }
2971 
2972   template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
2973   void OpVectorViotam(uint8_t dst, uint8_t src1) {
2974     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
2975     size_t vstart = GetCsr<CsrName::kVstart>();
2976     size_t vl = GetCsr<CsrName::kVl>();
2977     if (vstart != 0) {
2978       return Undefined();
2979     }
2980     // When vl = 0, there are no body elements, and no elements are updated in any destination
2981     // vector register group, including that no tail elements are updated with agnostic values.
2982     if (vl == 0) [[unlikely]] {
2983       return;
2984     }
2985     SIMD128Register arg1(state_->cpu.v[src1]);
2986     auto mask = GetMaskForVectorOperations<vma>();
2987     if constexpr (std::is_same_v<decltype(mask), SIMD128Register>) {
2988       arg1 &= mask;
2989     }
2990 
2991     size_t counter = 0;
2992     for (size_t index = 0; index < kRegistersInvolved; ++index) {
2993       SIMD128Register result{state_->cpu.v[dst + index]};
2994       auto [original_dst_value, new_counter] = intrinsics::Viotam<ElementType>(arg1, counter);
2995       arg1.Set(arg1.Get<__uint128_t>() >> kElementsCount);
2996       counter = new_counter;
2997 
2998       // Apply mask and put result values into dst register.
2999       result =
3000           VectorMasking<ElementType, vta, vma>(result, original_dst_value, vstart, vl, index, mask);
3001       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3002     }
3003   }
3004 
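  // vid.v writes each destination element's own index: dst[i] = i, subject to the usual
  // masking/tail policies applied via VectorMasking below.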
3005   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
3006   void OpVectorVidv(uint8_t dst) {
3007     return OpVectorVidv<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst);
3008   }
3009 
3010   template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
3011   void OpVectorVidv(uint8_t dst) {
3012     if (!IsAligned<kRegistersInvolved>(dst)) {
3013       return Undefined();
3014     }
3015     size_t vstart = GetCsr<CsrName::kVstart>();
3016     size_t vl = GetCsr<CsrName::kVl>();
3017     SetCsr<CsrName::kVstart>(0);
3018     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3019     // vector register group, including that no tail elements are updated with agnostic values.
3020     if (vstart >= vl) [[unlikely]] {
3021       return;
3022     }
3023     auto mask = GetMaskForVectorOperations<vma>();
3024     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3025       SIMD128Register result{state_->cpu.v[dst + index]};
3026       result = VectorMasking<ElementType, vta, vma>(
3027           result, std::get<0>(intrinsics::Vidv<ElementType>(index)), vstart, vl, index, mask);
3028       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3029     }
3030   }
3031 
3032   template <typename ElementType>
3033   void OpVectorVmvfs(uint8_t dst, uint8_t src) {
3034     // Note: intrinsics::NanBox always receives a Float64 argument, even when it processes a Float32
3035     // value, to avoid recursion in intrinsics handling.
3036     // NanBox in the interpreter takes FpRegister and returns FpRegister, which is probably the
3037     // cleanest way of processing that data (at least on x86-64 this produces code that's close to
3038     // optimal).
3039     NanBoxAndSetFpReg<ElementType>(dst, SIMD128Register{state_->cpu.v[src]}.Get<FpRegister>(0));
3040     SetCsr<CsrName::kVstart>(0);
3041   }
3042 
3043   template <typename ElementType, TailProcessing vta>
3044   void OpVectorVmvsx(uint8_t dst, ElementType element) {
3045     size_t vstart = GetCsr<CsrName::kVstart>();
3046     size_t vl = GetCsr<CsrName::kVl>();
3047     // The documentation doesn't specify what happens when vstart is non-zero but less than vl,
3048     // but at least one hardware implementation treats it as a NOP:
3049     //   https://github.com/riscv/riscv-v-spec/issues/937
3050     // We do the same here.
3051     if (vstart == 0 && vl != 0) [[likely]] {
3052       SIMD128Register result;
3053       if constexpr (vta == intrinsics::TailProcessing::kAgnostic) {
3054         result = ~SIMD128Register{};
3055       } else {
3056         result.Set(state_->cpu.v[dst]);
3057       }
3058       result.Set(element, 0);
3059       state_->cpu.v[dst] = result.Get<Int128>();
3060     }
3061     SetCsr<CsrName::kVstart>(0);
3062   }
3063 
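  // vmv.x.s copies element 0 of the source group to an integer register, sign-extending it to
  // 64 bits; e.g. an SEW=8 element equal to 0x80 yields 0xffff'ffff'ffff'ff80.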
3064   template <typename ElementType>
3065   void OpVectorVmvxs(uint8_t dst, uint8_t src1) {
3066     static_assert(ElementType::kIsSigned);
3067     // Conversion to Int64 would perform sign-extension if source element is signed.
3068     Register element = Int64{SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0)};
3069     SetRegOrIgnore(dst, element);
3070     SetCsr<CsrName::kVstart>(0);
3071   }
3072 
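  // Shared helper for vcpop.m (count of set bits among the first vl active mask bits) and
  // vfirst.m (index of the first such bit, or -1 when there is none, per the spec): the source is
  // ANDed with v0 when masked and with ~tail_mask so only the first vl bits contribute.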
3073   template <auto Intrinsic, auto vma>
3074   void OpVectorVWXUnary0(uint8_t dst, uint8_t src1) {
3075     size_t vstart = GetCsr<CsrName::kVstart>();
3076     size_t vl = GetCsr<CsrName::kVl>();
3077     if (vstart != 0) [[unlikely]] {
3078       return Undefined();
3079     }
3080     // Note: vcpop.m and vfirst.m are an explicit exception to the rule that vstart >= vl performs
3081     // no operation; they are explicitly defined to perform the write even if vl == 0.
3082     SIMD128Register arg1(state_->cpu.v[src1]);
3083     if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3084       SIMD128Register mask(state_->cpu.v[0]);
3085       arg1 &= mask;
3086     }
3087     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3088     arg1 &= ~tail_mask;
3089     SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3090     SetRegOrIgnore(dst, TruncateTo<UInt64>(BitCastToUnsigned(result.Get<Int128>())));
3091   }
3092 
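  // Mask-register logical ops (vmand.mm and friends) operate on whole mask registers: bits below
  // vstart are preserved, and bits from vl onwards are treated as tail-agnostic and filled with
  // ones here.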
3093   template <auto Intrinsic>
3094   void OpVectormm(uint8_t dst, uint8_t src1, uint8_t src2) {
3095     size_t vstart = GetCsr<CsrName::kVstart>();
3096     size_t vl = GetCsr<CsrName::kVl>();
3097     SetCsr<CsrName::kVstart>(0);
3098     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3099     // vector register group, including that no tail elements are updated with agnostic values.
3100     if (vstart >= vl) [[unlikely]] {
3101       return;
3102     }
3103     SIMD128Register arg1(state_->cpu.v[src1]);
3104     SIMD128Register arg2(state_->cpu.v[src2]);
3105     SIMD128Register result;
3106     if (vstart > 0) [[unlikely]] {
3107       const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3108       result.Set(state_->cpu.v[dst]);
3109       result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask);
3110     } else {
3111       result = Intrinsic(arg1, arg2);
3112     }
3113     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3114     result = result | tail_mask;
3115     state_->cpu.v[dst] = result.Get<__uint128_t>();
3116   }
3117 
3118   template <auto Intrinsic, auto vma>
3119   void OpVectorVMUnary0(uint8_t dst, uint8_t src1) {
3120     size_t vstart = GetCsr<CsrName::kVstart>();
3121     size_t vl = GetCsr<CsrName::kVl>();
3122     if (vstart != 0) {
3123       return Undefined();
3124     }
3125     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3126     // vector register group, including that no tail elements are updated with agnostic values.
3127     if (vl == 0) [[unlikely]] {
3128       return;
3129     }
3130     SIMD128Register arg1(state_->cpu.v[src1]);
3131     SIMD128Register mask;
3132     if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3133       mask.Set<__uint128_t>(state_->cpu.v[0]);
3134       arg1 &= mask;
3135     }
3136     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3137     arg1 &= ~tail_mask;
3138     SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3139     if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3140       arg1 &= mask;
3141       if (vma == InactiveProcessing::kUndisturbed) {
3142         result = (result & mask) | (SIMD128Register(state_->cpu.v[dst]) & ~mask);
3143       } else {
3144         result |= ~mask;
3145       }
3146     }
3147     result |= tail_mask;
3148     state_->cpu.v[dst] = result.Get<__uint128_t>();
3149   }
3150 
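  // vmv<nr>r.v copies whole register groups (nr = 1, 2, 4 or 8) regardless of vtype; only vstart
  // matters: a partially-skipped first register is copied element by element, the rest wholesale.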
3151   template <typename ElementType, size_t kRegistersInvolved>
3152   void OpVectorVmvXrv(uint8_t dst, uint8_t src) {
3153     if (!IsAligned<kRegistersInvolved>(dst | src)) {
3154       return Undefined();
3155     }
3156     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
3157     size_t vstart = GetCsr<CsrName::kVstart>();
3158     SetCsr<CsrName::kVstart>(0);
3159     // The usual property that no elements are written if vstart >= vl does not apply to these
3160     // instructions. Instead, no elements are written if vstart >= evl.
3161     if (vstart >= kElementsCount * kRegistersInvolved) [[unlikely]] {
3162       return;
3163     }
3164     if (vstart == 0) [[likely]] {
3165       for (size_t index = 0; index < kRegistersInvolved; ++index) {
3166         state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3167       }
3168       return;
3169     }
3170     size_t index = vstart / kElementsCount;
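    // Worked example (illustrative only): for vmv4r.v with 32-bit elements (kElementsCount == 4,
    // kRegistersInvolved == 4) and vstart == 6, index == 1 here, so register dst+0 is left
    // untouched, elements 2..3 of register src+1 are copied into dst+1 below, and registers
    // dst+2..dst+3 are then copied whole.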
3171     SIMD128Register destination{state_->cpu.v[dst + index]};
3172     SIMD128Register source{state_->cpu.v[src + index]};
3173     for (size_t element_index = vstart % kElementsCount; element_index < kElementsCount;
3174          ++element_index) {
3175       destination.Set(source.Get<ElementType>(element_index), element_index);
3176     }
3177     state_->cpu.v[dst + index] = destination.Get<__uint128_t>();
3178     for (index++; index < kRegistersInvolved; ++index) {
3179       state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3180     }
3181   }
3182 
3183   template <auto Intrinsic,
3184             typename ElementType,
3185             VectorRegisterGroupMultiplier vlmul,
3186             auto vma,
3187             CsrName... kExtraCsrs>
3188   void OpVectorToMaskvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3189     return OpVectorToMask<Intrinsic,
3190                           ElementType,
3191                           NumberOfRegistersInvolved(vlmul),
3192                           vma,
3193                           kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3194   }
3195 
3196   template <auto Intrinsic,
3197             typename ElementType,
3198             VectorRegisterGroupMultiplier vlmul,
3199             auto vma,
3200             CsrName... kExtraCsrs>
3201   void OpVectorToMaskvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3202     return OpVectorToMask<Intrinsic,
3203                           ElementType,
3204                           NumberOfRegistersInvolved(vlmul),
3205                           vma,
3206                           kExtraCsrs...>(dst, Vec{src1}, arg2);
3207   }
3208 
3209   template <auto Intrinsic,
3210             typename ElementType,
3211             size_t kRegistersInvolved,
3212             auto vma,
3213             CsrName... kExtraCsrs,
3214             typename... Args>
3215   void OpVectorToMask(uint8_t dst, Args... args) {
3216     // All args except dst must be aligned at kRegistersInvolved amount. We'll merge them
3217     // together and then do a combined check for all of them at once.
3218     if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...))) {
3219       return Undefined();
3220     }
3221     SIMD128Register original_result(state_->cpu.v[dst]);
3222     size_t vstart = GetCsr<CsrName::kVstart>();
3223     size_t vl = GetCsr<CsrName::kVl>();
3224     SetCsr<CsrName::kVstart>(0);
3225     SIMD128Register result_before_vl_masking;
3226     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3227     // vector register group, including that no tail elements are updated with agnostic values.
3228     if (vstart >= vl) [[unlikely]] {
3229       result_before_vl_masking = original_result;
3230     } else {
3231       result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>(
3232           [this, vstart, vl, args...](auto index) {
3233             return Intrinsic(this->GetCsr<kExtraCsrs>()...,
3234                              this->GetVectorArgument<ElementType, TailProcessing::kAgnostic, vma>(
3235                                  args, vstart, vl, index, intrinsics::NoInactiveProcessing{})...);
3236           });
3237       if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3238         SIMD128Register mask(state_->cpu.v[0]);
3239         if constexpr (vma == InactiveProcessing::kAgnostic) {
3240           result_before_vl_masking |= ~mask;
3241         } else {
3242           result_before_vl_masking = (mask & result_before_vl_masking) | (original_result & ~mask);
3243         }
3244       }
3245       if (vstart > 0) [[unlikely]] {
3246         const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3247         result_before_vl_masking =
3248             (original_result & ~start_mask) | (result_before_vl_masking & start_mask);
3249       }
3250     }
3251     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3252     state_->cpu.v[dst] = (result_before_vl_masking | tail_mask).Get<__uint128_t>();
3253   }
3254 
3255   template <auto Intrinsic,
3256             typename ElementType,
3257             VectorRegisterGroupMultiplier vlmul,
3258             TailProcessing vta,
3259             auto vma,
3260             CsrName... kExtraCsrs,
3261             typename... DstMaskType>
3262   void OpVectorv(uint8_t dst, uint8_t src1, DstMaskType... dst_mask) {
3263     return OpVectorv<Intrinsic,
3264                      ElementType,
3265                      NumberOfRegistersInvolved(vlmul),
3266                      vta,
3267                      vma,
3268                      kExtraCsrs...>(dst, src1, dst_mask...);
3269   }
3270 
3271   template <auto Intrinsic,
3272             typename ElementType,
3273             size_t kRegistersInvolved,
3274             TailProcessing vta,
3275             auto vma,
3276             CsrName... kExtraCsrs,
3277             typename... DstMaskType>
3278   void OpVectorv(uint8_t dst, uint8_t src, DstMaskType... dst_mask) {
3279     static_assert(sizeof...(dst_mask) <= 1);
3280     if (!IsAligned<kRegistersInvolved>(dst | src | (dst_mask | ... | 0))) {
3281       return Undefined();
3282     }
3283     size_t vstart = GetCsr<CsrName::kVstart>();
3284     size_t vl = GetCsr<CsrName::kVl>();
3285     SetCsr<CsrName::kVstart>(0);
3286     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3287     // vector register group, including that no tail elements are updated with agnostic values.
3288     if (vstart >= vl) [[unlikely]] {
3289       return;
3290     }
3291     auto mask = GetMaskForVectorOperations<vma>();
3292     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3293       SIMD128Register result{state_->cpu.v[dst + index]};
3294       SIMD128Register result_mask;
3295       if constexpr (sizeof...(DstMaskType) == 0) {
3296         result_mask.Set(state_->cpu.v[dst + index]);
3297       } else {
3298         uint8_t dst_mask_unpacked[1] = {dst_mask...};
3299         result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
3300       }
3301       SIMD128Register arg{state_->cpu.v[src + index]};
3302       result =
3303           VectorMasking<ElementType, vta, vma>(result,
3304                                                std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg)),
3305                                                result_mask,
3306                                                vstart,
3307                                                vl,
3308                                                index,
3309                                                mask);
3310       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3311     }
3312   }
3313 
3314   template <auto Intrinsic,
3315             typename ElementType,
3316             VectorRegisterGroupMultiplier vlmul,
3317             TailProcessing vta,
3318             auto vma,
3319             CsrName... kExtraCsrs,
3320             auto kDefaultElement>
3321   void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3322     return OpVectorvs<Intrinsic,
3323                       ElementType,
3324                       NumberOfRegistersInvolved(vlmul),
3325                       vta,
3326                       vma,
3327                       kExtraCsrs...>(dst, src1, src2);
3328   }
3329 
3330   template <auto Intrinsic,
3331             typename ElementType,
3332             size_t kRegistersInvolved,
3333             TailProcessing vta,
3334             auto vma,
3335             CsrName... kExtraCsrs,
3336             auto kDefaultElement>
3337   void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3338     if (!IsAligned<kRegistersInvolved>(dst | src1.start_no)) {
3339       return Undefined();
3340     }
3341     size_t vstart = GetCsr<CsrName::kVstart>();
3342     size_t vl = GetCsr<CsrName::kVl>();
3343     if (vstart != 0) {
3344       return Undefined();
3345     }
3346     SetCsr<CsrName::kVstart>(0);
3347     // If vl = 0, no operation is performed and the destination register is not updated.
3348     if (vl == 0) [[unlikely]] {
3349       return;
3350     }
3351     auto mask = GetMaskForVectorOperations<vma>();
3352     ElementType init = SIMD128Register{state_->cpu.v[src2]}.Get<ElementType>(0);
3353     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3354       init = std::get<0>(
3355           Intrinsic(GetCsr<kExtraCsrs>()...,
3356                     init,
3357                     GetVectorArgument<ElementType, vta, vma>(src1, vstart, vl, index, mask)));
3358     }
3359     SIMD128Register result{state_->cpu.v[dst]};
3360     result.Set(init, 0);
3361     result = std::get<0>(intrinsics::VectorMasking<ElementType, vta>(result, result, 0, 1));
3362     state_->cpu.v[dst] = result.Get<__uint128_t>();
3363   }
3364 
3365   template <auto Intrinsic,
3366             typename ElementType,
3367             VectorRegisterGroupMultiplier vlmul,
3368             TailProcessing vta,
3369             auto vma,
3370             CsrName... kExtraCsrs>
3371   void OpVectorvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3372     return OpVectorSameWidth<Intrinsic,
3373                              ElementType,
3374                              NumberOfRegistersInvolved(vlmul),
3375                              vta,
3376                              vma,
3377                              kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3378   }
3379 
3380   template <auto Intrinsic,
3381             typename ElementType,
3382             VectorRegisterGroupMultiplier vlmul,
3383             TailProcessing vta,
3384             auto vma,
3385             CsrName... kExtraCsrs>
3386   void OpVectorvvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3387     return OpVectorSameWidth<Intrinsic,
3388                              ElementType,
3389                              NumberOfRegistersInvolved(vlmul),
3390                              vta,
3391                              vma,
3392                              kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, Vec{dst});
3393   }
3394 
3395   template <auto Intrinsic,
3396             typename ElementType,
3397             VectorRegisterGroupMultiplier vlmul,
3398             TailProcessing vta,
3399             auto vma,
3400             CsrName... kExtraCsrs>
3401   void OpVectorWidenv(uint8_t dst, uint8_t src) {
3402     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3403                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3404       return OpVectorWiden<Intrinsic,
3405                            ElementType,
3406                            NumRegistersInvolvedForWideOperand(vlmul),
3407                            NumberOfRegistersInvolved(vlmul),
3408                            vta,
3409                            vma,
3410                            kExtraCsrs...>(dst, Vec{src});
3411     }
3412     return Undefined();
3413   }
3414 
3415   // 2*SEW = SEW op SEW
3416   // Attention: not to confuse with OpVectorWidenwv with 2*SEW = 2*SEW op SEW
3417   template <auto Intrinsic,
3418             typename ElementType,
3419             VectorRegisterGroupMultiplier vlmul,
3420             TailProcessing vta,
3421             auto vma,
3422             CsrName... kExtraCsrs>
3423   void OpVectorWidenvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3424     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3425                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3426       return OpVectorWiden<Intrinsic,
3427                            ElementType,
3428                            NumRegistersInvolvedForWideOperand(vlmul),
3429                            NumberOfRegistersInvolved(vlmul),
3430                            vta,
3431                            vma,
3432                            kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3433     }
3434     return Undefined();
3435   }
3436 
3437   // 2*SEW = SEW op SEW op 2*SEW
3438   template <auto Intrinsic,
3439             typename ElementType,
3440             VectorRegisterGroupMultiplier vlmul,
3441             TailProcessing vta,
3442             auto vma,
3443             CsrName... kExtraCsrs>
3444   void OpVectorWidenvvw(uint8_t dst, uint8_t src1, uint8_t src2) {
3445     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3446                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3447       return OpVectorWiden<Intrinsic,
3448                            ElementType,
3449                            NumRegistersInvolvedForWideOperand(vlmul),
3450                            NumberOfRegistersInvolved(vlmul),
3451                            vta,
3452                            vma,
3453                            kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, WideVec{dst});
3454     }
3455     return Undefined();
3456   }
3457 
3458   // 2*SEW = 2*SEW op SEW
3459   template <auto Intrinsic,
3460             typename ElementType,
3461             VectorRegisterGroupMultiplier vlmul,
3462             TailProcessing vta,
3463             auto vma,
3464             CsrName... kExtraCsrs>
3465   void OpVectorWidenwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3466     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3467                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3468       return OpVectorWiden<Intrinsic,
3469                            ElementType,
3470                            NumRegistersInvolvedForWideOperand(vlmul),
3471                            NumberOfRegistersInvolved(vlmul),
3472                            vta,
3473                            vma,
3474                            kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3475     }
3476     return Undefined();
3477   }
3478 
3479   template <auto Intrinsic,
3480             typename ElementType,
3481             VectorRegisterGroupMultiplier vlmul,
3482             TailProcessing vta,
3483             auto vma,
3484             CsrName... kExtraCsrs>
3485   void OpVectorWidenwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3486     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3487                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3488       return OpVectorWiden<Intrinsic,
3489                            ElementType,
3490                            NumRegistersInvolvedForWideOperand(vlmul),
3491                            NumberOfRegistersInvolved(vlmul),
3492                            vta,
3493                            vma,
3494                            kExtraCsrs...>(dst, WideVec{src1}, arg2);
3495     }
3496     return Undefined();
3497   }
3498 
3499   template <auto Intrinsic,
3500             typename ElementType,
3501             VectorRegisterGroupMultiplier vlmul,
3502             TailProcessing vta,
3503             auto vma,
3504             CsrName... kExtraCsrs>
3505   void OpVectorWidenvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3506     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3507                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3508       return OpVectorWiden<Intrinsic,
3509                            ElementType,
3510                            NumRegistersInvolvedForWideOperand(vlmul),
3511                            NumberOfRegistersInvolved(vlmul),
3512                            vta,
3513                            vma,
3514                            kExtraCsrs...>(dst, Vec{src1}, arg2);
3515     }
3516     return Undefined();
3517   }
3518 
3519   template <auto Intrinsic,
3520             typename ElementType,
3521             VectorRegisterGroupMultiplier vlmul,
3522             TailProcessing vta,
3523             auto vma,
3524             CsrName... kExtraCsrs>
3525   void OpVectorWidenvxw(uint8_t dst, uint8_t src1, ElementType arg2) {
3526     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3527                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3528       return OpVectorWiden<Intrinsic,
3529                            ElementType,
3530                            NumRegistersInvolvedForWideOperand(vlmul),
3531                            NumberOfRegistersInvolved(vlmul),
3532                            vta,
3533                            vma,
3534                            kExtraCsrs...>(dst, Vec{src1}, arg2, WideVec{dst});
3535     }
3536     return Undefined();
3537   }
3538 
3539   template <auto Intrinsic,
3540             typename ElementType,
3541             size_t kDestRegistersInvolved,
3542             size_t kRegistersInvolved,
3543             TailProcessing vta,
3544             auto vma,
3545             CsrName... kExtraCsrs,
3546             typename... Args>
3547   void OpVectorWiden(uint8_t dst, Args... args) {
3548     if constexpr (kDestRegistersInvolved == kRegistersInvolved) {
3549       static_assert(kDestRegistersInvolved == 1);
3550     } else {
3551       static_assert(kDestRegistersInvolved == 2 * kRegistersInvolved);
3552       // All normal (narrow) args must be aligned at kRegistersInvolved amount. We'll merge them
3553       // together and then do a combined check for all of them at once.
3554       uint8_t ored_args = OrValuesOnlyForType<Vec>(args...);
3555       // All wide args (together with dst) must be aligned at kDestRegistersInvolved amount. We'll
3556       // merge them together and then do a combined check for all of them at once.
3557       uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...) | dst;
3558       if (!IsAligned<kDestRegistersInvolved>(ored_wide_args) ||
3559           !IsAligned<kRegistersInvolved>(ored_args)) {
3560         return Undefined();
3561       }
3562     }
3563     // From the RISC-V vectors manual: if the destination EEW is greater than the source EEW and
3564     // the source EMUL is at least 1, then overlap is permitted only in the highest-numbered part
3565     // of the destination register group (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a
3566     // source of v0, v2, or v4 is not).
3567     // Because of the static_asserts above only one forbidden combination is possible here, and we
3568     // detect and reject it.
3569     if (OrResultsOnlyForType<Vec>([dst](auto arg) { return arg.start_no == dst; }, args...)) {
3570       return Undefined();
3571     }
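    // For example (illustrative only): for a widening vv-operation with a two-register narrow
    // group, dst is aligned to four registers and a narrow source may start either at dst (the
    // lowest-numbered part of the destination, forbidden by the rule above and rejected here) or
    // at dst + 2 (its highest-numbered part, which is allowed).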
3572     size_t vstart = GetCsr<CsrName::kVstart>();
3573     size_t vl = GetCsr<CsrName::kVl>();
3574     SetCsr<CsrName::kVstart>(0);
3575     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3576     // vector register group, including that no tail elements are updated with agnostic values.
3577     if (vstart >= vl) [[unlikely]] {
3578       return;
3579     }
3580     auto mask = GetMaskForVectorOperations<vma>();
3581     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3582       SIMD128Register result(state_->cpu.v[dst + 2 * index]);
3583       result = VectorMasking<WideType<ElementType>, vta, vma>(
3584           result,
3585           std::get<0>(Intrinsic(
3586               GetCsr<kExtraCsrs>()...,
3587               GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3588           vstart,
3589           vl,
3590           2 * index,
3591           mask);
3592       state_->cpu.v[dst + 2 * index] = result.Get<__uint128_t>();
3593       if constexpr (kDestRegistersInvolved > 1) {  // if lmul is one full register or more
3594         result.Set(state_->cpu.v[dst + 2 * index + 1]);
3595         result = VectorMasking<WideType<ElementType>, vta, vma>(
3596             result,
3597             std::get<0>(Intrinsic(
3598                 GetCsr<kExtraCsrs>()...,
3599                 GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3600             vstart,
3601             vl,
3602             2 * index + 1,
3603             mask);
3604         state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>();
3605       }
3606     }
3607   }
3608 
3609   template <auto Intrinsic,
3610             typename ElementType,
3611             VectorRegisterGroupMultiplier vlmul,
3612             TailProcessing vta,
3613             auto vma,
3614             CsrName... kExtraCsrs>
3615   void OpVectorvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3616     return OpVectorSameWidth<Intrinsic,
3617                              ElementType,
3618                              NumberOfRegistersInvolved(vlmul),
3619                              vta,
3620                              vma,
3621                              kExtraCsrs...>(dst, Vec{src1}, arg2);
3622   }
3623 
3624   template <auto Intrinsic,
3625             typename ElementType,
3626             size_t kRegistersInvolved,
3627             TailProcessing vta,
3628             auto vma,
3629             CsrName... kExtraCsrs,
3630             typename... Args>
3631   void OpVectorSameWidth(uint8_t dst, Args... args) {
3632     // All args must be aligned at kRegistersInvolved amount. We'll merge them
3633     // together and then do a combined check for all of them at once.
3634     if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...) | dst)) {
3635       return Undefined();
3636     }
3637     size_t vstart = GetCsr<CsrName::kVstart>();
3638     size_t vl = GetCsr<CsrName::kVl>();
3639     SetCsr<CsrName::kVstart>(0);
3640     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3641     // vector register group, including that no tail elements are updated with agnostic values.
3642     if (vstart >= vl) [[unlikely]] {
3643       return;
3644     }
3645     auto mask = GetMaskForVectorOperations<vma>();
3646     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3647       SIMD128Register result(state_->cpu.v[dst + index]);
3648       result = VectorMasking<ElementType, vta, vma>(
3649           result,
3650           std::get<0>(Intrinsic(
3651               GetCsr<kExtraCsrs>()...,
3652               GetVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3653           vstart,
3654           vl,
3655           index,
3656           mask);
3657       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3658     }
3659   }
3660 
3661   template <auto Intrinsic,
3662             typename TargetElementType,
3663             VectorRegisterGroupMultiplier vlmul,
3664             TailProcessing vta,
3665             auto vma,
3666             CsrName... kExtraCsrs>
3667   void OpVectorNarroww(uint8_t dst, uint8_t src) {
3668     if constexpr (sizeof(TargetElementType) < sizeof(Int64) &&
3669                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3670       return OpVectorNarrow<Intrinsic,
3671                             TargetElementType,
3672                             NumberOfRegistersInvolved(vlmul),
3673                             NumRegistersInvolvedForWideOperand(vlmul),
3674                             vta,
3675                             vma,
3676                             kExtraCsrs...>(dst, WideVec{src});
3677     }
3678     return Undefined();
3679   }
3680 
3681   // SEW = 2*SEW op SEW
3682   template <auto Intrinsic,
3683             typename ElementType,
3684             VectorRegisterGroupMultiplier vlmul,
3685             TailProcessing vta,
3686             auto vma,
3687             CsrName... kExtraCsrs>
3688   void OpVectorNarrowwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3689     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3690                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3691       return OpVectorNarrow<Intrinsic,
3692                             ElementType,
3693                             NumberOfRegistersInvolved(vlmul),
3694                             NumRegistersInvolvedForWideOperand(vlmul),
3695                             vta,
3696                             vma,
3697                             kExtraCsrs...>(dst, WideVec{src1}, arg2);
3698     }
3699     return Undefined();
3700   }
3701 
3702   // SEW = 2*SEW op SEW
3703   template <auto Intrinsic,
3704             typename ElementType,
3705             VectorRegisterGroupMultiplier vlmul,
3706             TailProcessing vta,
3707             auto vma,
3708             CsrName... kExtraCsrs>
3709   void OpVectorNarrowwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3710     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3711                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3712       return OpVectorNarrow<Intrinsic,
3713                             ElementType,
3714                             NumberOfRegistersInvolved(vlmul),
3715                             NumRegistersInvolvedForWideOperand(vlmul),
3716                             vta,
3717                             vma,
3718                             kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3719     }
3720     return Undefined();
3721   }
3722 
3723   template <auto Intrinsic,
3724             typename ElementType,
3725             size_t kRegistersInvolved,
3726             size_t kWideSrcRegistersInvolved,
3727             TailProcessing vta,
3728             auto vma,
3729             CsrName... kExtraCsrs,
3730             typename... Args>
3731   void OpVectorNarrow(uint8_t dst, Args... args) {
3732     if constexpr (kWideSrcRegistersInvolved == kRegistersInvolved) {
3733       static_assert(kWideSrcRegistersInvolved == 1);
3734     } else {
3735       // All normal (narrow) args must be aligned at kRegistersInvolved amount. We'll merge them
3736       // together and then do a combined check for all of them at once.
3737       uint8_t ored_args = OrValuesOnlyForType<Vec>(args...) | dst;
3738       // All wide args must be aligned at kWideSrcRegistersInvolved amount. We'll merge them
3739       // together and then do a combined check for all of them at once.
3740       uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...);
3741       if (!IsAligned<kWideSrcRegistersInvolved>(ored_wide_args) ||
3742           !IsAligned<kRegistersInvolved>(ored_args)) {
3743         return Undefined();
3744       }
3745       static_assert(kWideSrcRegistersInvolved == 2 * kRegistersInvolved);
3746       // From RISC-V vectors manual: If destination EEW is smaller than the source EEW, [then
3747       // overlap is permitted if] the overlap is in the lowest-numbered part of the source register
3748       // group (e.g., when LMUL=1, vnsrl.wi v0, v0, 3 is legal, but a destination of v1 is not).
3749       // We only have one possible invalid value here because of alignment requirements.
3750       if (OrResultsOnlyForType<Vec>(
3751               [dst](auto arg) { return arg.start_no == dst + kRegistersInvolved; }, args...)) {
3752         return Undefined();
3753       }
3754     }
3755     size_t vstart = GetCsr<CsrName::kVstart>();
3756     size_t vl = GetCsr<CsrName::kVl>();
3757     SetCsr<CsrName::kVstart>(0);
3758     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3759     // vector register group, including that no tail elements are updated with agnostic values.
3760     if (vstart >= vl) [[unlikely]] {
3761       return;
3762     }
3763     auto mask = GetMaskForVectorOperations<vma>();
3764     for (size_t index = 0; index < kRegistersInvolved; index++) {
3765       SIMD128Register orig_result(state_->cpu.v[dst + index]);
3766       SIMD128Register intrinsic_result = std::get<0>(
3767           Intrinsic(GetCsr<kExtraCsrs>()...,
3768                     GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
3769       if constexpr (kWideSrcRegistersInvolved > 1) {
3770         SIMD128Register result_high = std::get<0>(Intrinsic(
3771             GetCsr<kExtraCsrs>()...,
3772             GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
3773         intrinsic_result = std::get<0>(
3774             intrinsics::VMergeBottomHalfToTop<ElementType>(intrinsic_result, result_high));
3775       }
3776       auto result = VectorMasking<ElementType, vta, vma>(
3777           orig_result, intrinsic_result, vstart, vl, index, mask);
3778       state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
3779     }
3780   }
3781 
3782   template <auto Intrinsic,
3783             typename DestElementType,
3784             const uint8_t kFactor,
3785             VectorRegisterGroupMultiplier vlmul,
3786             TailProcessing vta,
3787             auto vma>
3788   void OpVectorVXUnary0(uint8_t dst, uint8_t src) {
3789     static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8);
3790     constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul);
3791     constexpr size_t kSourceRegistersInvolved = (kDestRegistersInvolved / kFactor) ?: 1;
3792     if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) {
3793       return Undefined();
3794     }
3795     size_t vstart = GetCsr<CsrName::kVstart>();
3796     size_t vl = GetCsr<CsrName::kVl>();
3797     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3798     // vector register group, including that no tail elements are updated with agnostic values.
3799     if (vstart >= vl) [[unlikely]] {
3800       SetCsr<CsrName::kVstart>(0);
3801       return;
3802     }
3803     auto mask = GetMaskForVectorOperations<vma>();
3804     for (size_t dst_index = 0; dst_index < kDestRegistersInvolved; dst_index++) {
3805       size_t src_index = dst_index / kFactor;
3806       size_t src_elem = dst_index % kFactor;
3807       SIMD128Register result{state_->cpu.v[dst + dst_index]};
3808       SIMD128Register arg{state_->cpu.v[src + src_index] >> ((128 / kFactor) * src_elem)};
3809 
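      // Illustrative sketch of the extraction above: for a vzext.vf2-style widening (kFactor == 2)
      // each source register feeds two destination registers, so dst_index == 0 takes the low
      // 64 bits of the source register while dst_index == 1 shifts the high 64 bits down
      // ((128 / 2) * 1 == 64) before the intrinsic widens them.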
3810       result = VectorMasking<DestElementType, vta, vma>(
3811           result, std::get<0>(Intrinsic(arg)), vstart, vl, dst_index, mask);
3812       state_->cpu.v[dst + dst_index] = result.Get<__uint128_t>();
3813     }
3814     SetCsr<CsrName::kVstart>(0);
3815   }
3816 
3817   template <auto Intrinsic,
3818             typename ElementType,
3819             VectorRegisterGroupMultiplier vlmul,
3820             TailProcessing vta,
3821             auto vma,
3822             CsrName... kExtraCsrs>
3823   void OpVectorvxv(uint8_t dst, uint8_t src1, ElementType arg2) {
3824     return OpVectorSameWidth<Intrinsic,
3825                              ElementType,
3826                              NumberOfRegistersInvolved(vlmul),
3827                              vta,
3828                              vma,
3829                              kExtraCsrs...>(dst, Vec{src1}, arg2, Vec{dst});
3830   }
3831 
3832   template <auto Intrinsic,
3833             typename ElementType,
3834             VectorRegisterGroupMultiplier vlmul,
3835             TailProcessing vta,
3836             auto vma,
3837             typename... DstMaskType>
3838   void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
3839     return OpVectorx<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
3840         dst, arg2, dst_mask...);
3841   }
3842 
3843   template <auto Intrinsic,
3844             typename ElementType,
3845             size_t kRegistersInvolved,
3846             TailProcessing vta,
3847             auto vma,
3848             typename... DstMaskType>
3849   void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
3850     static_assert(sizeof...(dst_mask) <= 1);
3851     if (!IsAligned<kRegistersInvolved>(dst | (dst_mask | ... | 0))) {
3852       return Undefined();
3853     }
3854     size_t vstart = GetCsr<CsrName::kVstart>();
3855     size_t vl = GetCsr<CsrName::kVl>();
3856     SetCsr<CsrName::kVstart>(0);
3857     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3858     // vector register group, including that no tail elements are updated with agnostic values.
3859     if (vstart >= vl) [[unlikely]] {
3860       return;
3861     }
3862     auto mask = GetMaskForVectorOperations<vma>();
3863     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3864       SIMD128Register result(state_->cpu.v[dst + index]);
3865       SIMD128Register result_mask;
3866       if constexpr (sizeof...(DstMaskType) == 0) {
3867         result_mask.Set(state_->cpu.v[dst + index]);
3868       } else {
3869         uint8_t dst_mask_unpacked[1] = {dst_mask...};
3870         result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
3871       }
3872       result = VectorMasking<ElementType, vta, vma>(
3873           result, std::get<0>(Intrinsic(arg2)), result_mask, vstart, vl, index, mask);
3874       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3875     }
3876   }
3877 
3878   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
3879   void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
3880     return OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
3881         dst, src, offset);
3882   }
3883 
3884   template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
3885   void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
3886     constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
3887     if (!IsAligned<kRegistersInvolved>(dst | src)) {
3888       return Undefined();
3889     }
3890     // Source and destination must not intersect.
3891     if (dst < (src + kRegistersInvolved) && src < (dst + kRegistersInvolved)) {
3892       return Undefined();
3893     }
3894     size_t vstart = GetCsr<CsrName::kVstart>();
3895     size_t vl = GetCsr<CsrName::kVl>();
3896     SetCsr<CsrName::kVstart>(0);
3897     if (vstart >= vl) [[unlikely]] {
3898       // From 16.3: For all of the [slide instructions], if vstart >= vl, the
3899       // instruction performs no operation and leaves the destination vector
3900       // register unchanged.
3901       return;
3902     }
3903     auto mask = GetMaskForVectorOperations<vma>();
3904     // The slideup operation leaves elements 0 through MAX(vstart, OFFSET)-1 unchanged.
3905     //
3906     // From 16.3.1: Destination elements OFFSET through vl-1 are written if
3907     // unmasked and if OFFSET < vl.
3908     // However if OFFSET > vl, we still need to apply the tail policy (as
3909     // clarified in https://github.com/riscv/riscv-v-spec/issues/263). Given
3910     // that OFFSET could be well past vl we start at vl rather than OFFSET in
3911     // that case.
3912     const size_t start_elem_index = std::min(std::max(vstart, offset), vl);
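    // Example (illustrative only, assuming 4 elements per register and a 4-register group): with
    // vstart == 0, OFFSET == 20 and vl == 14, start_elem_index == 14, so the loop below only visits
    // the last register and merely applies the tail policy to elements 14..15; with OFFSET == 6 it
    // starts at register 1 and elements 6..13 are taken from the slid-up source (subject to mask).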
3913     for (size_t index = start_elem_index / kElementsPerRegister; index < kRegistersInvolved;
3914          ++index) {
3915       SIMD128Register result(state_->cpu.v[dst + index]);
3916 
3917       // Source registers that fall before the input register group correspond to the first
3918       // OFFSET result elements, which must remain undisturbed. We zero-initialize them here,
3919       // but their values are eventually ignored by the vstart masking in VectorMasking.
3920       ssize_t first_arg_disp = index - 1 - offset / kElementsPerRegister;
3921       SIMD128Register arg1 =
3922           (first_arg_disp < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp];
3923       SIMD128Register arg2 =
3924           (first_arg_disp + 1 < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp + 1];
3925 
3926       result =
3927           VectorMasking<ElementType, vta, vma>(result,
3928                                                std::get<0>(intrinsics::VectorSlideUp<ElementType>(
3929                                                    offset % kElementsPerRegister, arg1, arg2)),
3930                                                start_elem_index,
3931                                                vl,
3932                                                index,
3933                                                mask);
3934       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3935     }
3936   }
3937 
3938   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
3939   void OpVectorslide1up(uint8_t dst, uint8_t src, ElementType xval) {
3940     // Save the vstart before it's reset by vslideup.
3941     size_t vstart = GetCsr<CsrName::kVstart>();
3942     // Slide all the elements by one.
3943     OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src, 1);
3944     if (exception_raised_) {
3945       return;
3946     }
3947     if (vstart > 0) {
3948       // First element is not affected and should remain untouched.
3949       return;
3950     }
3951 
3952     // From 16.3.3: places the x register argument at location 0 of the
3953     // destination vector register group provided that element 0 is active,
3954     // otherwise the destination element update follows the current mask
3955     // agnostic/undisturbed policy.
3956     if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
3957       auto mask = GetMaskForVectorOperations<vma>();
3958       if (!(mask.template Get<uint8_t>(0) & 0x1)) {
3959         // The first element is masked. OpVectorslideup already applied the proper masking to it.
3960         return;
3961       }
3962     }
3963 
3964     SIMD128Register result = state_->cpu.v[dst];
3965     result.Set(xval, 0);
3966     state_->cpu.v[dst] = result.Get<__uint128_t>();
3967   }
3968 
3969   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
3970   void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
3971     return OpVectorslidedown<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
3972         dst, src, offset);
3973   }
3974 
3975   template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
3976   void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
3977     constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
3978     if (!IsAligned<kRegistersInvolved>(dst | src)) {
3979       return Undefined();
3980     }
3981     size_t vstart = GetCsr<CsrName::kVstart>();
3982     size_t vl = GetCsr<CsrName::kVl>();
3983     SetCsr<CsrName::kVstart>(0);
3984     if (vstart >= vl) [[unlikely]] {
3985       // From 16.3: For all of the [slide instructions], if vstart >= vl, the
3986       // instruction performs no operation and leaves the destination vector
3987       // register unchanged.
3988       return;
3989     }
3990     auto mask = GetMaskForVectorOperations<vma>();
3991     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3992       SIMD128Register result(state_->cpu.v[dst + index]);
3993 
3994       size_t first_arg_disp = index + offset / kElementsPerRegister;
3995       SIMD128Register arg1 = (first_arg_disp >= kRegistersInvolved)
3996                                  ? SIMD128Register{0}
3997                                  : state_->cpu.v[src + first_arg_disp];
3998       SIMD128Register arg2 = (first_arg_disp + 1 >= kRegistersInvolved)
3999                                  ? SIMD128Register{0}
4000                                  : state_->cpu.v[src + first_arg_disp + 1];
4001 
4002       result =
4003           VectorMasking<ElementType, vta, vma>(result,
4004                                                std::get<0>(intrinsics::VectorSlideDown<ElementType>(
4005                                                    offset % kElementsPerRegister, arg1, arg2)),
4006                                                vstart,
4007                                                vl,
4008                                                index,
4009                                                mask);
4010       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4011     }
4012   }
4013 
4014   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
4015   void OpVectorslide1down(uint8_t dst, uint8_t src, ElementType xval) {
4016     constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4017     const size_t vl = GetCsr<CsrName::kVl>();
4018 
4019     // From 16.3.4: ... places the x register argument at location vl-1 in the
4020     // destination vector register, provided that element vl-1 is active,
4021     // otherwise the destination element is **unchanged** (emphasis added.)
4022     //
4023     // This means that element at vl-1 would not follow the Mask Agnostic policy
4024     // and would stay Unchanged when inactive. So we need to undo just this one
4025     // element if using agnostic masking.
4026     ElementType last_elem_value = xval;
4027     const size_t last_elem_register = (vl - 1) / kElementsPerRegister;
4028     const size_t last_elem_within_reg_pos = (vl - 1) % kElementsPerRegister;
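    // For instance (illustrative only): with 16-bit elements (8 per register) and vl == 5 the
    // element to patch is element 4 of the first destination register; whether it receives xval,
    // keeps the slide-down result, or is restored afterwards depends on the mask policy handled
    // below.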
4029     bool set_last_element = true;
4030     if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
4031       auto mask = GetMaskForVectorOperations<vma>();
4032       auto [mask_bits] =
4033           intrinsics::MaskForRegisterInSequence<ElementType>(mask, last_elem_register);
4034       using MaskType = decltype(mask_bits);
4035       if ((static_cast<MaskType::BaseType>(mask_bits) & (1 << last_elem_within_reg_pos)) == 0) {
4036         if constexpr (vma == intrinsics::InactiveProcessing::kUndisturbed) {
4037           // Element is inactive and the undisturbed policy will be followed,
4038           // just let Opvectorslidedown handle everything.
4039           set_last_element = false;
4040         } else {
4041           // Element is inactive and the agnostic policy will be followed, get
4042           // the original value to restore before it's changed by
4043           // the agnostic policy.
4044           SIMD128Register original = state_->cpu.v[dst + last_elem_register];
4045           last_elem_value = original.Get<ElementType>(last_elem_within_reg_pos);
4046         }
4047       }
4048     }
4049 
4050     // Slide all the elements by one.
4051     OpVectorslidedown<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src, 1);
4052     if (exception_raised_) {
4053       return;
4054     }
4055     if (!set_last_element) {
4056       return;
4057     }
4058 
4059     SIMD128Register result = state_->cpu.v[dst + last_elem_register];
4060     result.Set(last_elem_value, last_elem_within_reg_pos);
4061     state_->cpu.v[dst + last_elem_register] = result.Get<__uint128_t>();
4062   }
4063 
4064   // Helper function needed to generate a bitmask result from non-bitmask inputs.
4065   // We are processing between 1 and 8 registers here and each register produces between 2 bits
4066   // (for 64 bit inputs) and 16 bits (for 8 bit inputs) of bitmask, which are then combined into
4067   // the final result (between 2 and 128 bits long).
4068   // Note that we are not handling tail here! These bits remain undefined and should be handled
4069   // later.
4070   // TODO(b/317757595): Add separate tests to verify the logic.
4071   template <typename ElementType, size_t kRegistersInvolved, typename Intrinsic>
4072   SIMD128Register CollectBitmaskResult(Intrinsic intrinsic) {
4073     // We employ two distinct tactics to handle all possibilities:
4074     //   1. For 8bit/16bit types we get a full UInt8/UInt16 result and thus use SIMD128Register.Set.
4075     //   2. For 32bit/64bit types we only get 2 bits or 4 bits from each call and thus need to use
4076     //      shifts to accumulate the result.
4077     //      But since each of up to 8 results is at most 4 bits, the total bitmask is 32 bits (or less).
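    // Rough example (not from the original comments): with UInt64 elements and 8 registers each
    // SimdMaskToBitMask call yields 2 bits (kElemNum == 2), so iteration `index` shifts its bits
    // into positions [2*index, 2*index+1] of the UInt32 accumulator, giving a 16-bit mask overall;
    // with UInt8 elements each call yields a full 16-bit mask that is stored via Set(raw, index).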
4078     std::conditional_t<sizeof(ElementType) < sizeof(UInt32), SIMD128Register, UInt32>
4079         bitmask_result{};
4080     for (UInt32 index = UInt32{0}; index < UInt32(kRegistersInvolved); index += UInt32{1}) {
4081       const auto [raw_result] =
4082           intrinsics::SimdMaskToBitMask<ElementType>(std::get<0>(intrinsic(index)));
4083       if constexpr (sizeof(ElementType) < sizeof(Int32)) {
4084         bitmask_result.Set(raw_result, index);
4085       } else {
4086         constexpr UInt32 kElemNum =
4087             UInt32{static_cast<uint32_t>((sizeof(SIMD128Register) / sizeof(ElementType)))};
4088         bitmask_result |= UInt32(UInt8(raw_result)) << (index * kElemNum);
4089       }
4090     }
4091     return SIMD128Register(bitmask_result);
4092   }
4093 
4094   void Nop() {}
4095 
4096   void Undefined() {
4097     UndefinedInsn(GetInsnAddr());
4098     // If there is a guest handler registered for SIGILL, its processing is delayed until the next
4099     // sync point (likely the main dispatching loop) where pending signals are processed. Thus we
4100     // must ensure that insn_addr isn't automatically advanced in FinalizeInsn.
4101     exception_raised_ = true;
4102   }
4103 
4104   //
4105   // Guest state getters/setters.
4106   //
4107 
4108   Register GetReg(uint8_t reg) const {
4109     CheckRegIsValid(reg);
4110     return state_->cpu.x[reg];
4111   }
4112 
4113   Register GetRegOrZero(uint8_t reg) { return reg == 0 ? 0 : GetReg(reg); }
4114 
4115   void SetReg(uint8_t reg, Register value) {
4116     if (exception_raised_) {
4117       // Do not produce side effects.
4118       return;
4119     }
4120     CheckRegIsValid(reg);
4121     state_->cpu.x[reg] = value;
4122   }
4123 
4124   void SetRegOrIgnore(uint8_t reg, Register value) {
4125     if (reg != 0) {
4126       SetReg(reg, value);
4127     }
4128   }
4129 
4130   FpRegister GetFpReg(uint8_t reg) const {
4131     CheckFpRegIsValid(reg);
4132     return state_->cpu.f[reg];
4133   }
4134 
4135   template <typename FloatType>
4136   FpRegister GetFRegAndUnboxNan(uint8_t reg);
4137 
4138   template <typename FloatType>
4139   void NanBoxAndSetFpReg(uint8_t reg, FpRegister value);
4140 
4141   //
4142   // Various helper methods.
4143   //
4144 
4145   template <CsrName kName>
4146   [[nodiscard]] Register GetCsr() const {
4147     return state_->cpu.*CsrFieldAddr<kName>;
4148   }
4149 
4150   template <CsrName kName>
4151   void SetCsr(Register arg) {
4152     if (exception_raised_) {
4153       return;
4154     }
4155     state_->cpu.*CsrFieldAddr<kName> = arg & kCsrMask<kName>;
4156   }
4157 
4158   [[nodiscard]] uint64_t GetImm(uint64_t imm) const { return imm; }
4159 
4160   [[nodiscard]] Register Copy(Register value) const { return value; }
4161 
4162   [[nodiscard]] GuestAddr GetInsnAddr() const { return state_->cpu.insn_addr; }
4163 
4164   void FinalizeInsn(uint8_t insn_len) {
4165     if (!branch_taken_ && !exception_raised_) {
4166       state_->cpu.insn_addr += insn_len;
4167     }
4168   }
4169 
4170 #include "berberis/intrinsics/interpreter_intrinsics_hooks-inl.h"
4171 
4172  private:
4173   template <typename DataType>
4174   Register Load(const void* ptr) {
4175     static_assert(std::is_integral_v<DataType>);
4176     CHECK(!exception_raised_);
4177     FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
4178     if (result.is_fault) {
4179       exception_raised_ = true;
4180       return {};
4181     }
4182     return static_cast<DataType>(result.value);
4183   }
4184 
4185   template <typename DataType>
4186   void Store(void* ptr, uint64_t data) {
4187     static_assert(std::is_integral_v<DataType>);
4188     CHECK(!exception_raised_);
4189     exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
4190   }
4191 
4192   void CheckShamtIsValid(int8_t shamt) const {
4193     CHECK_GE(shamt, 0);
4194     CHECK_LT(shamt, 64);
4195   }
4196 
4197   void CheckShamt32IsValid(int8_t shamt) const {
4198     CHECK_GE(shamt, 0);
4199     CHECK_LT(shamt, 32);
4200   }
4201 
4202   void CheckRegIsValid(uint8_t reg) const {
4203     CHECK_GT(reg, 0u);
4204     CHECK_LE(reg, std::size(state_->cpu.x));
4205   }
4206 
4207   void CheckFpRegIsValid(uint8_t reg) const { CHECK_LT(reg, std::size(state_->cpu.f)); }
4208 
4209   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4210   SIMD128Register GetHighVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4211                                         size_t /*vstart*/,
4212                                         size_t /*vl*/,
4213                                         size_t index,
4214                                         MaskType /*mask*/) {
4215     return std::get<0>(intrinsics::VMovTopHalfToBottom<ElementType>(
4216         SIMD128Register{state_->cpu.v[src.start_no + index]}));
4217   }
4218 
4219   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4220   SIMD128Register GetHighVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4221                                         size_t /*vstart*/,
4222                                         size_t /*vl*/,
4223                                         size_t index,
4224                                         MaskType /*mask*/) {
4225     return SIMD128Register{state_->cpu.v[src.start_no + 2 * index + 1]};
4226   }
4227 
4228   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4229   ElementType GetHighVectorArgument(ElementType arg,
4230                                     size_t /*vstart*/,
4231                                     size_t /*vl*/,
4232                                     size_t /*index*/,
4233                                     MaskType /*mask*/) {
4234     return arg;
4235   }
4236 
4237   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4238   SIMD128Register GetLowVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4239                                        size_t /*vstart*/,
4240                                        size_t /*vl*/,
4241                                        size_t index,
4242                                        MaskType /*mask*/) {
4243     return SIMD128Register{state_->cpu.v[src.start_no + index]};
4244   }
4245 
4246   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4247   SIMD128Register GetLowVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4248                                        size_t /*vstart*/,
4249                                        size_t /*vl*/,
4250                                        size_t index,
4251                                        MaskType /*mask*/) {
4252     return SIMD128Register{state_->cpu.v[src.start_no + 2 * index]};
4253   }
4254 
4255   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4256   ElementType GetLowVectorArgument(ElementType arg,
4257                                    size_t /*vstart*/,
4258                                    size_t /*vl*/,
4259                                    size_t /*index*/,
4260                                    MaskType /*mask*/) {
4261     return arg;
4262   }
4263 
4264   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4265   SIMD128Register GetVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4266                                     size_t /*vstart*/,
4267                                     size_t /*vl*/,
4268                                     size_t index,
4269                                     MaskType /*mask*/) {
4270     return SIMD128Register{state_->cpu.v[src.start_no + index]};
4271   }
4272 
4273   template <typename ElementType,
4274             TailProcessing vta,
4275             auto vma,
4276             typename MaskType,
4277             auto kDefaultElement>
4278   SIMD128Register GetVectorArgument(Vec<kDefaultElement> src,
4279                                     size_t vstart,
4280                                     size_t vl,
4281                                     size_t index,
4282                                     MaskType mask) {
4283     return VectorMasking<kDefaultElement, vta, vma>(
4284         SIMD128Register{state_->cpu.v[src.start_no + index]}, vstart, vl, index, mask);
4285   }
4286 
4287   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4288   ElementType GetVectorArgument(ElementType arg,
4289                                 size_t /*vstart*/,
4290                                 size_t /*vl*/,
4291                                 size_t /*index*/,
4292                                 MaskType /*mask*/) {
4293     return arg;
4294   }
4295 
4296   template <bool kUseMasking>
4297   std::conditional_t<kUseMasking, SIMD128Register, intrinsics::NoInactiveProcessing>
4298   GetMaskForVectorOperationsIfNeeded() {
4299     if constexpr (kUseMasking) {
4300       return {state_->cpu.v[0]};
4301     } else {
4302       return intrinsics::NoInactiveProcessing{};
4303     }
4304   }
4305 
4306   template <auto vma>
4307   std::conditional_t<std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
4308                      intrinsics::NoInactiveProcessing,
4309                      SIMD128Register>
4310   GetMaskForVectorOperations() {
4311     return GetMaskForVectorOperationsIfNeeded<
4312         !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>();
4313   }
4314 
4315   template <auto kDefaultElement, TailProcessing vta, auto vma, typename MaskType>
4316   SIMD128Register VectorMasking(SIMD128Register result,
4317                                 size_t vstart,
4318                                 size_t vl,
4319                                 size_t index,
4320                                 MaskType mask) {
4321     return std::get<0>(intrinsics::VectorMasking<kDefaultElement, vta, vma>(
4322         result,
4323         vstart - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4324         vl - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4325         std::get<0>(
4326             intrinsics::MaskForRegisterInSequence<decltype(kDefaultElement)>(mask, index))));
4327   }
4328 
  template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
  SIMD128Register VectorMasking(SIMD128Register dest,
                                SIMD128Register result,
                                size_t vstart,
                                size_t vl,
                                size_t index,
                                MaskType mask) {
    return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
        dest,
        result,
        vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
        vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
        std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
  }

  template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
  SIMD128Register VectorMasking(SIMD128Register dest,
                                SIMD128Register result,
                                SIMD128Register result_mask,
                                size_t vstart,
                                size_t vl,
                                size_t index,
                                MaskType mask) {
    return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
        dest,
        result,
        result_mask,
        vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
        vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
        std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
  }

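  // OrValuesOnlyForType/OrResultsOnlyForType fold their arguments with bitwise "or", applying
  // the lambda only to arguments whose type is an instance of the given template and
  // substituting kDefaultValue for everything else. As an illustrative (not literal) usage,
  // OrValuesOnlyForType<Vec>(args...) would combine the single wrapped field of every Vec
  // argument, since the default lambda just unpacks that field with a structured binding.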
  template <template <auto> typename ProcessType,
            auto kLambda =
                [](auto packaged_value) {
                  auto [unpacked_value] = packaged_value;
                  return unpacked_value;
                },
            auto kDefaultValue = false,
            typename... Args>
  [[nodiscard]] static constexpr auto OrValuesOnlyForType(Args... args) {
    return OrResultsOnlyForType<ProcessType, kDefaultValue>(kLambda, args...);
  }

  template <template <auto> typename ProcessTemplateType,
            auto kDefaultValue = false,
            typename Lambda,
            typename... Args>
  [[nodiscard]] static constexpr auto OrResultsOnlyForType(Lambda lambda, Args... args) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wbitwise-instead-of-logical"
    return ([lambda](auto arg) {
      if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
        return lambda(arg);
      } else {
        return kDefaultValue;
      }
    }(args) |
            ...);
#pragma GCC diagnostic pop
  }

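  // Same type-based filtering as above, but the lambda is invoked purely for its side effects
  // and all results are discarded.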
  template <template <auto> typename ProcessTemplateType, typename Lambda, typename... Args>
  static constexpr void ProcessOnlyForType(Lambda lambda, Args... args) {
    (
        [lambda](auto arg) {
          if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
            lambda(arg);
          }
        }(args),
        ...);
  }

  ThreadState* state_;
  bool branch_taken_;
  // This flag is set by illegal instructions and by faulted memory accesses. The former always
  // stops playback of the current instruction, so no extra handling is needed. The latter may be
  // followed by further side-effecting operations before the current instruction finishes:
  //   Load (faulted)    -> SetReg
  //   LoadFp (faulted)  -> NanBoxAndSetFpReg
  // If an exception has been raised before these operations run, we skip them. For all other
  // operations with side effects we check that this flag is never raised.
  bool exception_raised_;
};

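// Out-of-class specializations for CSRs that are not stored verbatim in CpuState: kCycle is
// served by CPUClockCount(), kFCsr/kFFlags/kFrm map onto the host floating-point environment,
// and kVxrm/kVxsat are packed into the vcsr field.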
template <>
[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kCycle>() const {
  return CPUClockCount();
}

template <>
[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFCsr>() const {
  return FeGetExceptions() | (state_->cpu.frm << 5);
}

template <>
[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFFlags>() const {
  return FeGetExceptions();
}

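// vlenb is the vector register length in bytes; this interpreter models VLEN = 128 bits.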
template <>
[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVlenb>() const {
  return 16;
}

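// vxrm and vxsat are kept packed inside vcsr: vxrm occupies bits [1:0] and vxsat is bit 2.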
template <>
[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxrm>() const {
  return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11;
}

template <>
[[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxsat>() const {
  return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> >> 2;
}

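// fcsr packs the accrued exception flags (fflags) into bits [4:0] and the rounding mode (frm)
// into bits [7:5]; writes are forwarded to the host floating-point environment.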
template <>
void inline Interpreter::SetCsr<CsrName::kFCsr>(Register arg) {
  CHECK(!exception_raised_);
  FeSetExceptions(arg & 0b1'1111);
  arg = (arg >> 5) & kCsrMask<CsrName::kFrm>;
  state_->cpu.frm = arg;
  FeSetRound(arg);
}

template <>
void inline Interpreter::SetCsr<CsrName::kFFlags>(Register arg) {
  CHECK(!exception_raised_);
  FeSetExceptions(arg & 0b1'1111);
}

template <>
void inline Interpreter::SetCsr<CsrName::kFrm>(Register arg) {
  CHECK(!exception_raised_);
  arg &= kCsrMask<CsrName::kFrm>;
  state_->cpu.frm = arg;
  FeSetRound(arg);
}

template <>
void inline Interpreter::SetCsr<CsrName::kVxrm>(Register arg) {
  CHECK(!exception_raised_);
  state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
      (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b100) | (arg & 0b11);
}

template <>
void inline Interpreter::SetCsr<CsrName::kVxsat>(Register arg) {
  CHECK(!exception_raised_);
  state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
      (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11) | ((arg & 0b1) << 2);
}

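// Single-precision values are kept NaN-boxed in the 64-bit FP registers, so Float32 accesses
// unbox/box on the way in and out while Float64 accesses use the register verbatim.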
template <>
[[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>(
    uint8_t reg) {
  CheckFpRegIsValid(reg);
  FpRegister value = state_->cpu.f[reg];
  return UnboxNan<Float32>(value);
}

template <>
[[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>(
    uint8_t reg) {
  CheckFpRegIsValid(reg);
  return state_->cpu.f[reg];
}

template <>
void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) {
  if (exception_raised_) {
    // Do not produce side effects.
    return;
  }
  CheckFpRegIsValid(reg);
  state_->cpu.f[reg] = NanBox<Float32>(value);
}

template <>
void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) {
  if (exception_raised_) {
    // Do not produce side effects.
    return;
  }
  CheckFpRegIsValid(reg);
  state_->cpu.f[reg] = value;
}

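// When vector handlers are compiled in a separate translation unit (see the build flag below),
// the OpVector specializations are declared extern here so this file does not instantiate them
// again.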
#ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadIndexedArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadStrideArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadUnitStrideArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVfArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVvArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIViArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVvArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVxArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVvArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVxArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreIndexedArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreStrideArgs& args);
template <>
extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreUnitStrideArgs& args);
#endif

}  // namespace berberis