1 /*
2 * Copyright (C) 2023 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "berberis/interpreter/riscv64/interpreter.h"
18
19 #include <atomic>
20 #include <cfenv>
21 #include <cstdint>
22 #include <cstring>
23
24 #include "berberis/base/bit_util.h"
25 #include "berberis/base/checks.h"
26 #include "berberis/base/macros.h"
27 #include "berberis/decoder/riscv64/decoder.h"
28 #include "berberis/decoder/riscv64/semantics_player.h"
29 #include "berberis/guest_state/guest_addr.h"
30 #include "berberis/guest_state/guest_state.h"
31 #include "berberis/intrinsics/guest_cpu_flags.h" // ToHostRoundingMode
32 #include "berberis/intrinsics/intrinsics.h"
33 #include "berberis/intrinsics/intrinsics_float.h"
34 #include "berberis/intrinsics/riscv64/vector_intrinsics.h"
35 #include "berberis/intrinsics/simd_register.h"
36 #include "berberis/intrinsics/type_traits.h"
37 #include "berberis/kernel_api/run_guest_syscall.h"
38 #include "berberis/runtime_primitives/interpret_helpers.h"
39 #include "berberis/runtime_primitives/memory_region_reservation.h"
40 #include "berberis/runtime_primitives/recovery_code.h"
41
42 #include "faulty_memory_accesses.h"
43 #include "regs.h"
44
45 namespace berberis {
46
47 inline constexpr std::memory_order AqRlToStdMemoryOrder(bool aq, bool rl) {
48 if (aq) {
49 if (rl) {
50 return std::memory_order_acq_rel;
51 } else {
52 return std::memory_order_acquire;
53 }
54 } else {
55 if (rl) {
56 return std::memory_order_release;
57 } else {
58 return std::memory_order_relaxed;
59 }
60 }
61 }
62
63 template <typename ConcreteType, template <auto> typename TemplateType>
64 inline constexpr bool IsTypeTemplateOf = false;
65
66 template <template <auto> typename TemplateType, auto Value>
67 inline constexpr bool IsTypeTemplateOf<TemplateType<Value>, TemplateType> = true;
68
69 class Interpreter {
70 public:
71 using CsrName = berberis::CsrName;
72 using Decoder = Decoder<SemanticsPlayer<Interpreter>>;
73 using Register = uint64_t;
74 using FpRegister = uint64_t;
75 using Float32 = intrinsics::Float32;
76 using Float64 = intrinsics::Float64;
77
78 explicit Interpreter(ThreadState* state)
79 : state_(state), branch_taken_(false), exception_raised_(false) {}
80
81 //
82 // Instruction implementations.
83 //
84
85 Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr) {
86 switch (opcode) {
87 case Decoder::CsrOpcode::kCsrrs:
88 return arg | csr;
89 case Decoder::CsrOpcode::kCsrrc:
90 return ~arg & csr;
91 default:
92 Undefined();
93 return {};
94 }
95 }
96
97 Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr) {
98 return UpdateCsr(static_cast<Decoder::CsrOpcode>(opcode), imm, csr);
99 }
100
101 // Note: we prefer not to use C11/C++ atomic_thread_fence or even gcc/clang builtin
102 // __atomic_thread_fence because all these functions rely on the fact that the compiler never uses
103 // non-temporal loads and stores and only issues “mfence” when sequentially consistent ordering is
104 // requested. They never issue “lfence” or “sfence”.
105 // Instead we take a page from the Linux kernel's book and map read ordering to “lfence”, write
106 // ordering to “sfence” and read-write ordering to “mfence”.
107 // This can be important in the future if we would start using nontemporal moves in manually
108 // created assembly code.
109 // Ordering affecting I/O devices is not relevant to user-space code, thus we just ignore bits
110 // related to device I/O.
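// For example, a guest “fence r,r” (pr=1, sr=1) sets only read_fence below and maps to “lfence”,
// “fence w,w” (pw=1, sw=1) sets only write_fence and maps to “sfence”, and a full “fence rw,rw”
// sets both and maps to “mfence”.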
111 void Fence(Decoder::FenceOpcode /*opcode*/,
112 Register /*src*/,
113 bool sw,
114 bool sr,
115 bool /*so*/,
116 bool /*si*/,
117 bool pw,
118 bool pr,
119 bool /*po*/,
120 bool /*pi*/) {
121 bool read_fence = sr | pr;
122 bool write_fence = sw | pw;
123 // Two types of fences (total store ordering fence and normal fence) are supposed to be
124 // processed differently, but only for the “read_fence && write_fence” case (otherwise a total
125 // store ordering fence becomes a normal fence for “forward compatibility”). Yet because x86
126 // doesn't distinguish between these two types of fences, and since we are supposed to map all
127 // not-yet-defined fences to the normal fence (again, for “forward compatibility”), it's OK to
128 // just ignore the opcode field.
129 if (read_fence) {
130 if (write_fence) {
131 asm volatile("mfence" ::: "memory");
132 } else {
133 asm volatile("lfence" ::: "memory");
134 }
135 } else if (write_fence) {
136 asm volatile("sfence" ::: "memory");
137 }
138 return;
139 }
140
141 template <typename IntType, bool aq, bool rl>
142 Register Lr(int64_t addr) {
143 static_assert(std::is_integral_v<IntType>, "Lr: IntType must be integral");
144 static_assert(std::is_signed_v<IntType>, "Lr: IntType must be signed");
145 CHECK(!exception_raised_);
146 // Address must be aligned on size of IntType.
147 CHECK((addr % sizeof(IntType)) == 0ULL);
148 return MemoryRegionReservation::Load<IntType>(&state_->cpu, addr, AqRlToStdMemoryOrder(aq, rl));
149 }
150
151 template <typename IntType, bool aq, bool rl>
152 Register Sc(int64_t addr, IntType val) {
153 static_assert(std::is_integral_v<IntType>, "Sc: IntType must be integral");
154 static_assert(std::is_signed_v<IntType>, "Sc: IntType must be signed");
155 CHECK(!exception_raised_);
156 // Address must be aligned on size of IntType.
157 CHECK((addr % sizeof(IntType)) == 0ULL);
158 return static_cast<Register>(MemoryRegionReservation::Store<IntType>(
159 &state_->cpu, addr, val, AqRlToStdMemoryOrder(aq, rl)));
160 }
161
162 Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2) {
163 switch (opcode) {
164 case Decoder::OpOpcode::kAdd:
165 return Int64(arg1) + Int64(arg2);
166 case Decoder::OpOpcode::kSub:
167 return Int64(arg1) - Int64(arg2);
168 case Decoder::OpOpcode::kAnd:
169 return Int64(arg1) & Int64(arg2);
170 case Decoder::OpOpcode::kOr:
171 return Int64(arg1) | Int64(arg2);
172 case Decoder::OpOpcode::kXor:
173 return Int64(arg1) ^ Int64(arg2);
174 case Decoder::OpOpcode::kSll:
175 return Int64(arg1) << Int64(arg2);
176 case Decoder::OpOpcode::kSrl:
177 return UInt64(arg1) >> Int64(arg2);
178 case Decoder::OpOpcode::kSra:
179 return Int64(arg1) >> Int64(arg2);
180 case Decoder::OpOpcode::kSlt:
181 return Int64(arg1) < Int64(arg2) ? 1 : 0;
182 case Decoder::OpOpcode::kSltu:
183 return UInt64(arg1) < UInt64(arg2) ? 1 : 0;
184 case Decoder::OpOpcode::kMul:
185 return Int64(arg1) * Int64(arg2);
186 case Decoder::OpOpcode::kMulh:
187 return NarrowTopHalf(Widen(Int64(arg1)) * Widen(Int64(arg2)));
188 case Decoder::OpOpcode::kMulhsu:
189 return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2))));
190 case Decoder::OpOpcode::kMulhu:
191 return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2)));
192 case Decoder::OpOpcode::kAndn:
193 return Int64(arg1) & (~Int64(arg2));
194 case Decoder::OpOpcode::kOrn:
195 return Int64(arg1) | (~Int64(arg2));
196 case Decoder::OpOpcode::kXnor:
197 return ~(Int64(arg1) ^ Int64(arg2));
198 default:
199 Undefined();
200 return {};
201 }
202 }
203
204 Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2) {
205 switch (opcode) {
206 case Decoder::Op32Opcode::kAddw:
207 return Widen(TruncateTo<Int32>(arg1) + TruncateTo<Int32>(arg2));
208 case Decoder::Op32Opcode::kSubw:
209 return Widen(TruncateTo<Int32>(arg1) - TruncateTo<Int32>(arg2));
210 case Decoder::Op32Opcode::kSllw:
211 return Widen(TruncateTo<Int32>(arg1) << TruncateTo<Int32>(arg2));
212 case Decoder::Op32Opcode::kSrlw:
213 return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) >> TruncateTo<Int32>(arg2)));
214 case Decoder::Op32Opcode::kSraw:
215 return Widen(TruncateTo<Int32>(arg1) >> TruncateTo<Int32>(arg2));
216 case Decoder::Op32Opcode::kMulw:
217 return Widen(TruncateTo<Int32>(arg1) * TruncateTo<Int32>(arg2));
218 default:
219 Undefined();
220 return {};
221 }
222 }
223
224 Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset) {
225 void* ptr = ToHostAddr<void>(arg + offset);
226 switch (operand_type) {
227 case Decoder::LoadOperandType::k8bitUnsigned:
228 return Load<uint8_t>(ptr);
229 case Decoder::LoadOperandType::k16bitUnsigned:
230 return Load<uint16_t>(ptr);
231 case Decoder::LoadOperandType::k32bitUnsigned:
232 return Load<uint32_t>(ptr);
233 case Decoder::LoadOperandType::k64bit:
234 return Load<uint64_t>(ptr);
235 case Decoder::LoadOperandType::k8bitSigned:
236 return Load<int8_t>(ptr);
237 case Decoder::LoadOperandType::k16bitSigned:
238 return Load<int16_t>(ptr);
239 case Decoder::LoadOperandType::k32bitSigned:
240 return Load<int32_t>(ptr);
241 default:
242 Undefined();
243 return {};
244 }
245 }
246
247 template <typename DataType>
248 FpRegister LoadFp(Register arg, int16_t offset) {
249 static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
250 CHECK(!exception_raised_);
251 DataType* ptr = ToHostAddr<DataType>(arg + offset);
252 FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
253 if (result.is_fault) {
254 exception_raised_ = true;
255 return {};
256 }
257 return result.value;
258 }
259
260 Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm) {
261 switch (opcode) {
262 case Decoder::OpImmOpcode::kAddi:
263 return arg + int64_t{imm};
264 case Decoder::OpImmOpcode::kSlti:
265 return bit_cast<int64_t>(arg) < int64_t{imm} ? 1 : 0;
266 case Decoder::OpImmOpcode::kSltiu:
267 return arg < bit_cast<uint64_t>(int64_t{imm}) ? 1 : 0;
268 case Decoder::OpImmOpcode::kXori:
269 return arg ^ int64_t{imm};
270 case Decoder::OpImmOpcode::kOri:
271 return arg | int64_t{imm};
272 case Decoder::OpImmOpcode::kAndi:
273 return arg & int64_t{imm};
274 default:
275 Undefined();
276 return {};
277 }
278 }
279
280 Register Lui(int32_t imm) { return int64_t{imm}; }
281
282 Register Auipc(int32_t imm) {
283 uint64_t pc = state_->cpu.insn_addr;
284 return pc + int64_t{imm};
285 }
286
287 Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm) {
288 switch (opcode) {
289 case Decoder::OpImm32Opcode::kAddiw:
290 return int32_t(arg) + int32_t{imm};
291 default:
292 Undefined();
293 return {};
294 }
295 }
296
297 // TODO(b/232598137): rework ecall to not take parameters explicitly.
298 Register Ecall(Register /* syscall_nr */,
299 Register /* arg0 */,
300 Register /* arg1 */,
301 Register /* arg2 */,
302 Register /* arg3 */,
303 Register /* arg4 */,
304 Register /* arg5 */) {
305 CHECK(!exception_raised_);
306 RunGuestSyscall(state_);
307 return state_->cpu.x[A0];
308 }
309
310 Register Slli(Register arg, int8_t imm) { return arg << imm; }
311
312 Register Srli(Register arg, int8_t imm) { return arg >> imm; }
313
314 Register Srai(Register arg, int8_t imm) { return bit_cast<int64_t>(arg) >> imm; }
315
316 Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm) {
317 switch (opcode) {
318 case Decoder::ShiftImm32Opcode::kSlliw:
319 return int32_t(arg) << int32_t{imm};
320 case Decoder::ShiftImm32Opcode::kSrliw:
321 return bit_cast<int32_t>(uint32_t(arg) >> uint32_t{imm});
322 case Decoder::ShiftImm32Opcode::kSraiw:
323 return int32_t(arg) >> int32_t{imm};
324 default:
325 Undefined();
326 return {};
327 }
328 }
329
330 Register Rori(Register arg, int8_t shamt) {
331 CheckShamtIsValid(shamt);
332 return (((uint64_t(arg) >> shamt)) | (uint64_t(arg) << (64 - shamt)));
333 }
334
335 Register Roriw(Register arg, int8_t shamt) {
336 CheckShamt32IsValid(shamt);
337 return int32_t(((uint32_t(arg) >> shamt)) | (uint32_t(arg) << (32 - shamt)));
338 }
339
340 void Store(Decoder::MemoryDataOperandType operand_type,
341 Register arg,
342 int16_t offset,
343 Register data) {
344 void* ptr = ToHostAddr<void>(arg + offset);
345 switch (operand_type) {
346 case Decoder::MemoryDataOperandType::k8bit:
347 Store<uint8_t>(ptr, data);
348 break;
349 case Decoder::MemoryDataOperandType::k16bit:
350 Store<uint16_t>(ptr, data);
351 break;
352 case Decoder::MemoryDataOperandType::k32bit:
353 Store<uint32_t>(ptr, data);
354 break;
355 case Decoder::MemoryDataOperandType::k64bit:
356 Store<uint64_t>(ptr, data);
357 break;
358 default:
359 return Undefined();
360 }
361 }
362
363 template <typename DataType>
364 void StoreFp(Register arg, int16_t offset, FpRegister data) {
365 static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
366 CHECK(!exception_raised_);
367 DataType* ptr = ToHostAddr<DataType>(arg + offset);
368 exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
369 }
370
371 void CompareAndBranch(Decoder::BranchOpcode opcode,
372 Register arg1,
373 Register arg2,
374 int16_t offset) {
375 bool cond_value;
376 switch (opcode) {
377 case Decoder::BranchOpcode::kBeq:
378 cond_value = arg1 == arg2;
379 break;
380 case Decoder::BranchOpcode::kBne:
381 cond_value = arg1 != arg2;
382 break;
383 case Decoder::BranchOpcode::kBltu:
384 cond_value = arg1 < arg2;
385 break;
386 case Decoder::BranchOpcode::kBgeu:
387 cond_value = arg1 >= arg2;
388 break;
389 case Decoder::BranchOpcode::kBlt:
390 cond_value = bit_cast<int64_t>(arg1) < bit_cast<int64_t>(arg2);
391 break;
392 case Decoder::BranchOpcode::kBge:
393 cond_value = bit_cast<int64_t>(arg1) >= bit_cast<int64_t>(arg2);
394 break;
395 default:
396 return Undefined();
397 }
398
399 if (cond_value) {
400 Branch(offset);
401 }
402 }
403
404 void Branch(int32_t offset) {
405 CHECK(!exception_raised_);
406 state_->cpu.insn_addr += offset;
407 branch_taken_ = true;
408 }
409
410 void BranchRegister(Register base, int16_t offset) {
411 CHECK(!exception_raised_);
412 state_->cpu.insn_addr = (base + offset) & ~uint64_t{1};
413 branch_taken_ = true;
414 }
415
416 FpRegister Fmv(FpRegister arg) { return arg; }
417
418 //
419 // V extensions.
420 //
421
422 using TailProcessing = intrinsics::TailProcessing;
423 using InactiveProcessing = intrinsics::InactiveProcessing;
424
425 enum class VectorSelectElementWidth {
426 k8bit = 0b000,
427 k16bit = 0b001,
428 k32bit = 0b010,
429 k64bit = 0b011,
430 kMaxValue = 0b111,
431 };
432
433 enum class VectorRegisterGroupMultiplier {
434 k1register = 0b000,
435 k2registers = 0b001,
436 k4registers = 0b010,
437 k8registers = 0b011,
438 kEigthOfRegister = 0b101,
439 kQuarterOfRegister = 0b110,
440 kHalfOfRegister = 0b111,
441 kMaxValue = 0b111,
442 };
443
444 static constexpr size_t NumberOfRegistersInvolved(VectorRegisterGroupMultiplier vlmul) {
445 switch (vlmul) {
446 case VectorRegisterGroupMultiplier::k2registers:
447 return 2;
448 case VectorRegisterGroupMultiplier::k4registers:
449 return 4;
450 case VectorRegisterGroupMultiplier::k8registers:
451 return 8;
452 default:
453 return 1;
454 }
455 }
456
457 static constexpr size_t NumRegistersInvolvedForWideOperand(VectorRegisterGroupMultiplier vlmul) {
458 switch (vlmul) {
459 case VectorRegisterGroupMultiplier::k1register:
460 return 2;
461 case VectorRegisterGroupMultiplier::k2registers:
462 return 4;
463 case VectorRegisterGroupMultiplier::k4registers:
464 return 8;
465 default:
466 return 1;
467 }
468 }
469
470 template <typename ElementType, VectorRegisterGroupMultiplier vlmul>
471 static constexpr size_t GetVlmax() {
472 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
473 switch (vlmul) {
474 case VectorRegisterGroupMultiplier::k1register:
475 return kElementsCount;
476 case VectorRegisterGroupMultiplier::k2registers:
477 return 2 * kElementsCount;
478 case VectorRegisterGroupMultiplier::k4registers:
479 return 4 * kElementsCount;
480 case VectorRegisterGroupMultiplier::k8registers:
481 return 8 * kElementsCount;
482 case VectorRegisterGroupMultiplier::kEigthOfRegister:
483 return kElementsCount / 8;
484 case VectorRegisterGroupMultiplier::kQuarterOfRegister:
485 return kElementsCount / 4;
486 case VectorRegisterGroupMultiplier::kHalfOfRegister:
487 return kElementsCount / 2;
488 default:
489 return 0;
490 }
491 }
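// A couple of illustrative values (assuming the 128-bit SIMD128Register used throughout this
// file): GetVlmax<UInt16, VectorRegisterGroupMultiplier::k2registers>() is 16 (8 elements per
// register, 2 registers in the group), while GetVlmax<UInt64,
// VectorRegisterGroupMultiplier::kHalfOfRegister>() is 1 (2 elements per register, half of a
// register in the group).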
492
493 template <typename VOpArgs, typename... ExtraArgs>
494 void OpVector(const VOpArgs& args, ExtraArgs... extra_args) {
495 // Note: whole register instructions are not dependent on vtype and are supposed to work even
496 // if vill is set! Handle them before processing other instructions.
497 // Note: other types of loads and stores are not special and would be processed as usual.
498 // TODO(khim): Handle vstart properly.
499 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs>) {
500 if (args.opcode == Decoder::VLUmOpOpcode::kVlXreXX) {
501 if (!IsPowerOf2(args.nf + 1)) {
502 return Undefined();
503 }
504 if ((args.dst & args.nf) != 0) {
505 return Undefined();
506 }
507 auto [src] = std::tuple{extra_args...};
508 __uint128_t* ptr = bit_cast<__uint128_t*>(src);
509 for (size_t index = 0; index <= args.nf; index++) {
510 state_->cpu.v[args.dst + index] = ptr[index];
511 }
512 return;
513 }
514 }
515
516 if constexpr (std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
517 if (args.opcode == Decoder::VSUmOpOpcode::kVsX) {
518 if (args.width != Decoder::MemoryDataOperandType::k8bit) {
519 return Undefined();
520 }
521 if (!IsPowerOf2(args.nf + 1)) {
522 return Undefined();
523 }
524 if ((args.data & args.nf) != 0) {
525 return Undefined();
526 }
527 auto [src] = std::tuple{extra_args...};
528 __uint128_t* ptr = bit_cast<__uint128_t*>(src);
529 for (size_t index = 0; index <= args.nf; index++) {
530 ptr[index] = state_->cpu.v[args.data + index];
531 }
532 return;
533 }
534 }
535
536 // The RISC-V V extension uses the 8-bit “opcode extension” vtype CSR to make sure the 32-bit
537 // encoding remains usable.
538 //
539 // Great care is taken to ensure that vector code wouldn't need to change the vtype CSR often (e.g.
540 // there are special mask instructions which allow one to manipulate masks without the need
541 // to change the CPU mode).
542 //
543 // Currently we don't have support for multiple CPU modes in Berberis, thus we can only handle
544 // these instructions in the interpreter.
545 //
546 // TODO(b/300690740): develop and implement strategy which would allow us to support vector
547 // intrinsics not just in the interpreter. Move code from this function to semantics player.
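// Bits of vtype used below (vtype CSR layout from the RISC-V V spec): [2:0] vlmul, [5:3] vsew,
// [6] vta, [7] vma; the topmost bit is vill, which is what the sign check right below detects.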
548 Register vtype = GetCsr<CsrName::kVtype>();
549 if (static_cast<std::make_signed_t<Register>>(vtype) < 0) {
550 return Undefined();
551 }
552 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
553 std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
554 std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
555 std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
556 std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
557 std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
558 switch (args.width) {
559 case Decoder::MemoryDataOperandType::k8bit:
560 return OpVector<UInt8>(args, vtype, extra_args...);
561 case Decoder::MemoryDataOperandType::k16bit:
562 return OpVector<UInt16>(args, vtype, extra_args...);
563 case Decoder::MemoryDataOperandType::k32bit:
564 return OpVector<UInt32>(args, vtype, extra_args...);
565 case Decoder::MemoryDataOperandType::k64bit:
566 return OpVector<UInt64>(args, vtype, extra_args...);
567 default:
568 return Undefined();
569 }
570 } else {
571 VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0x7);
572 if constexpr (std::is_same_v<VOpArgs, Decoder::VOpFVfArgs> ||
573 std::is_same_v<VOpArgs, Decoder::VOpFVvArgs>) {
574 switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
575 case VectorSelectElementWidth::k16bit:
576 if constexpr (sizeof...(extra_args) == 0) {
577 return OpVector<intrinsics::Float16>(args, vlmul, vtype);
578 } else {
579 return Undefined();
580 }
581 case VectorSelectElementWidth::k32bit:
582 return OpVector<Float32>(
583 args,
584 vlmul,
585 vtype,
586 std::get<0>(intrinsics::UnboxNan<Float32>(bit_cast<Float64>(extra_args)))...);
587 case VectorSelectElementWidth::k64bit:
588 // Note: if arguments are 64bit floats then we don't need to do any unboxing.
589 return OpVector<Float64>(args, vlmul, vtype, bit_cast<Float64>(extra_args)...);
590 default:
591 return Undefined();
592 }
593 } else {
594 switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
595 case VectorSelectElementWidth::k8bit:
596 return OpVector<UInt8>(args, vlmul, vtype, extra_args...);
597 case VectorSelectElementWidth::k16bit:
598 return OpVector<UInt16>(args, vlmul, vtype, extra_args...);
599 case VectorSelectElementWidth::k32bit:
600 return OpVector<UInt32>(args, vlmul, vtype, extra_args...);
601 case VectorSelectElementWidth::k64bit:
602 return OpVector<UInt64>(args, vlmul, vtype, extra_args...);
603 default:
604 return Undefined();
605 }
606 }
607 }
608 }
609
610 template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
611 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
612 auto vemul = Decoder::SignExtend<3>(vtype & 0b111);
613 vemul -= ((vtype >> 3) & 0b111); // Divide by SEW.
614 vemul +=
615 static_cast<std::underlying_type_t<decltype(args.width)>>(args.width); // Multiply by EEW.
616 if (vemul < -3 || vemul > 3) [[unlikely]] {
617 return Undefined();
618 }
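// Worked example (a sketch, assuming Decoder::MemoryDataOperandType encodes k8bit..k64bit as
// 0..3): vtype with LMUL=1 (vlmul=0b000) and SEW=32 (vsew=0b010) combined with an EEW=8 access
// (width=0b000) gives vemul = 0 - 2 + 0 = -2, i.e. EMUL=1/4, which passes the check above.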
619 // Note: whole register loads and stores treat args.nf differently, but they are processed
620 // separately above anyway, because they also ignore vtype and all the information in it!
621 // For other loads and stores the affected number of registers (EMUL * NF) must be 8 or less.
622 if ((vemul > 0) && ((args.nf + 1) * (1 << vemul) > 8)) {
623 return Undefined();
624 }
625 return OpVector<ElementType>(
626 args, static_cast<VectorRegisterGroupMultiplier>(vemul & 0b111), vtype, extra_args...);
627 }
628
629 template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
630 void OpVector(const VOpArgs& args,
631 VectorRegisterGroupMultiplier vlmul,
632 Register vtype,
633 ExtraArgs... extra_args) {
634 switch (vlmul) {
635 case VectorRegisterGroupMultiplier::k1register:
636 return OpVector<ElementType, VectorRegisterGroupMultiplier::k1register>(
637 args, vtype, extra_args...);
638 case VectorRegisterGroupMultiplier::k2registers:
639 return OpVector<ElementType, VectorRegisterGroupMultiplier::k2registers>(
640 args, vtype, extra_args...);
641 case VectorRegisterGroupMultiplier::k4registers:
642 return OpVector<ElementType, VectorRegisterGroupMultiplier::k4registers>(
643 args, vtype, extra_args...);
644 case VectorRegisterGroupMultiplier::k8registers:
645 return OpVector<ElementType, VectorRegisterGroupMultiplier::k8registers>(
646 args, vtype, extra_args...);
647 case VectorRegisterGroupMultiplier::kEigthOfRegister:
648 return OpVector<ElementType, VectorRegisterGroupMultiplier::kEigthOfRegister>(
649 args, vtype, extra_args...);
650 case VectorRegisterGroupMultiplier::kQuarterOfRegister:
651 return OpVector<ElementType, VectorRegisterGroupMultiplier::kQuarterOfRegister>(
652 args, vtype, extra_args...);
653 case VectorRegisterGroupMultiplier::kHalfOfRegister:
654 return OpVector<ElementType, VectorRegisterGroupMultiplier::kHalfOfRegister>(
655 args, vtype, extra_args...);
656 default:
657 return Undefined();
658 }
659 }
660
661 template <typename ElementType,
662 VectorRegisterGroupMultiplier vlmul,
663 typename VOpArgs,
664 typename... ExtraArgs>
665 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
666 if (args.vm) {
667 return OpVector<ElementType, vlmul, intrinsics::NoInactiveProcessing{}>(
668 args, vtype, extra_args...);
669 }
670 if (vtype >> 7) {
671 return OpVector<ElementType, vlmul, InactiveProcessing::kAgnostic>(
672 args, vtype, extra_args...);
673 }
674 return OpVector<ElementType, vlmul, InactiveProcessing::kUndisturbed>(
675 args, vtype, extra_args...);
676 }
677
678 template <typename ElementType,
679 VectorRegisterGroupMultiplier vlmul,
680 auto vma,
681 typename VOpArgs,
682 typename... ExtraArgs>
683 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
684 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
685 std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
686 std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
687 std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
688 std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
689 std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
690 constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
691 // Note: whole register loads and stores treat args.nf differently, but they are processed
692 // separately above anyway, because they also ignore vtype and all the information in it!
693 switch (args.nf) {
694 case 0:
695 return OpVector<ElementType, 1, vlmul, vma>(args, vtype, extra_args...);
696 case 1:
697 if constexpr (kRegistersInvolved > 4) {
698 return Undefined();
699 } else {
700 return OpVector<ElementType, 2, vlmul, vma>(args, vtype, extra_args...);
701 }
702 case 2:
703 if constexpr (kRegistersInvolved > 2) {
704 return Undefined();
705 } else {
706 return OpVector<ElementType, 3, vlmul, vma>(args, vtype, extra_args...);
707 }
708 case 3:
709 if constexpr (kRegistersInvolved > 2) {
710 return Undefined();
711 } else {
712 return OpVector<ElementType, 4, vlmul, vma>(args, vtype, extra_args...);
713 }
714 case 4:
715 if constexpr (kRegistersInvolved > 1) {
716 return Undefined();
717 } else {
718 return OpVector<ElementType, 5, vlmul, vma>(args, vtype, extra_args...);
719 }
720 case 5:
721 if constexpr (kRegistersInvolved > 1) {
722 return Undefined();
723 } else {
724 return OpVector<ElementType, 6, vlmul, vma>(args, vtype, extra_args...);
725 }
726 case 6:
727 if constexpr (kRegistersInvolved > 1) {
728 return Undefined();
729 } else {
730 return OpVector<ElementType, 7, vlmul, vma>(args, vtype, extra_args...);
731 }
732 case 7:
733 if constexpr (kRegistersInvolved > 1) {
734 return Undefined();
735 } else {
736 return OpVector<ElementType, 8, vlmul, vma>(args, vtype, extra_args...);
737 }
738 }
739 } else {
740 if ((vtype >> 6) & 1) {
741 return OpVector<ElementType, vlmul, TailProcessing::kAgnostic, vma>(args, extra_args...);
742 }
743 return OpVector<ElementType, vlmul, TailProcessing::kUndisturbed, vma>(args, extra_args...);
744 }
745 }
746
747 template <typename ElementType,
748 size_t kSegmentSize,
749 VectorRegisterGroupMultiplier vlmul,
750 auto vma,
751 typename VOpArgs,
752 typename... ExtraArgs>
753 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
754 // Indexed loads and stores have two operands with different ElementTypes and lmul sizes, so we
755 // pass vtype to do further selection.
756 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
757 std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs>) {
758 // Because we know that we are dealing with indexed loads and stores and wouldn't need to
759 // convert emul to anything else, we can immediately turn it into kIndexRegistersInvolved
760 // here.
761 if ((vtype >> 6) & 1) {
762 return OpVector<kSegmentSize,
763 ElementType,
764 NumberOfRegistersInvolved(vlmul),
765 TailProcessing::kAgnostic,
766 vma>(args, vtype, extra_args...);
767 }
768 return OpVector<kSegmentSize,
769 ElementType,
770 NumberOfRegistersInvolved(vlmul),
771 TailProcessing::kUndisturbed,
772 vma>(args, vtype, extra_args...);
773 } else {
774 // For other instructions we have parsed all the information from vtype and only need to pass
775 // args and extra_args.
776 if ((vtype >> 6) & 1) {
777 return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kAgnostic, vma>(
778 args, extra_args...);
779 }
780 return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kUndisturbed, vma>(
781 args, extra_args...);
782 }
783 }
784
785 template <size_t kSegmentSize,
786 typename IndexElementType,
787 size_t kIndexRegistersInvolved,
788 TailProcessing vta,
789 auto vma,
790 typename VOpArgs,
791 typename... ExtraArgs>
792 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
793 VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0b111);
794 switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
795 case VectorSelectElementWidth::k8bit:
796 return OpVector<UInt8, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
797 args, vlmul, extra_args...);
798 case VectorSelectElementWidth::k16bit:
799 return OpVector<UInt16, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
800 args, vlmul, extra_args...);
801 case VectorSelectElementWidth::k32bit:
802 return OpVector<UInt32, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
803 args, vlmul, extra_args...);
804 case VectorSelectElementWidth::k64bit:
805 return OpVector<UInt64, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
806 args, vlmul, extra_args...);
807 default:
808 return Undefined();
809 }
810 }
811
812 template <typename DataElementType,
813 size_t kSegmentSize,
814 typename IndexElementType,
815 size_t kIndexRegistersInvolved,
816 TailProcessing vta,
817 auto vma,
818 typename VOpArgs,
819 typename... ExtraArgs>
820 void OpVector(const VOpArgs& args, VectorRegisterGroupMultiplier vlmul, ExtraArgs... extra_args) {
821 switch (vlmul) {
822 case VectorRegisterGroupMultiplier::k1register:
823 return OpVector<DataElementType,
824 VectorRegisterGroupMultiplier::k1register,
825 IndexElementType,
826 kSegmentSize,
827 kIndexRegistersInvolved,
828 vta,
829 vma>(args, extra_args...);
830 case VectorRegisterGroupMultiplier::k2registers:
831 return OpVector<DataElementType,
832 VectorRegisterGroupMultiplier::k2registers,
833 IndexElementType,
834 kSegmentSize,
835 kIndexRegistersInvolved,
836 vta,
837 vma>(args, extra_args...);
838 case VectorRegisterGroupMultiplier::k4registers:
839 return OpVector<DataElementType,
840 VectorRegisterGroupMultiplier::k4registers,
841 IndexElementType,
842 kSegmentSize,
843 kIndexRegistersInvolved,
844 vta,
845 vma>(args, extra_args...);
846 case VectorRegisterGroupMultiplier::k8registers:
847 return OpVector<DataElementType,
848 VectorRegisterGroupMultiplier::k8registers,
849 IndexElementType,
850 kSegmentSize,
851 kIndexRegistersInvolved,
852 vta,
853 vma>(args, extra_args...);
854 case VectorRegisterGroupMultiplier::kEigthOfRegister:
855 return OpVector<DataElementType,
856 VectorRegisterGroupMultiplier::kEigthOfRegister,
857 IndexElementType,
858 kSegmentSize,
859 kIndexRegistersInvolved,
860 vta,
861 vma>(args, extra_args...);
862 case VectorRegisterGroupMultiplier::kQuarterOfRegister:
863 return OpVector<DataElementType,
864 VectorRegisterGroupMultiplier::kQuarterOfRegister,
865 IndexElementType,
866 kSegmentSize,
867 kIndexRegistersInvolved,
868 vta,
869 vma>(args, extra_args...);
870 case VectorRegisterGroupMultiplier::kHalfOfRegister:
871 return OpVector<DataElementType,
872 VectorRegisterGroupMultiplier::kHalfOfRegister,
873 IndexElementType,
874 kSegmentSize,
875 kIndexRegistersInvolved,
876 vta,
877 vma>(args, extra_args...);
878 default:
879 return Undefined();
880 }
881 }
882
883 // CSR registers that are permitted as an argument of the strip-mining intrinsic.
884 using CsrName::kFrm;
885 using CsrName::kVxrm;
886 using CsrName::kVxsat;
887 // The argument of the OpVectorXXX functions is the number of the first register in the group.
888 template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
889 struct Vec {
890 uint8_t start_no;
891 };
892 // Vector argument 2x wide (for narrowing and widening instructions).
893 template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
894 struct WideVec {
895 uint8_t start_no;
896 };
897
898 template <typename DataElementType,
899 VectorRegisterGroupMultiplier vlmul,
900 typename IndexElementType,
901 size_t kSegmentSize,
902 size_t kIndexRegistersInvolved,
903 TailProcessing vta,
904 auto vma>
905 void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
906 return OpVector<DataElementType,
907 kSegmentSize,
908 NumberOfRegistersInvolved(vlmul),
909 IndexElementType,
910 kIndexRegistersInvolved,
911 vta,
912 vma>(args, src);
913 }
914
915 template <typename DataElementType,
916 size_t kSegmentSize,
917 size_t kNumRegistersInGroup,
918 typename IndexElementType,
919 size_t kIndexRegistersInvolved,
920 TailProcessing vta,
921 auto vma>
922 void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
923 if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
924 return Undefined();
925 }
926 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
927 alignas(alignof(SIMD128Register))
928 IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
929 memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
930 return OpVectorLoad<DataElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
931 args.dst, src, [&indexes](size_t index) { return indexes[index]; });
932 }
933
934 template <typename ElementType,
935 size_t kSegmentSize,
936 VectorRegisterGroupMultiplier vlmul,
937 TailProcessing vta,
938 auto vma>
939 void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
940 return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(
941 args, src, stride);
942 }
943
944 template <typename ElementType,
945 size_t kSegmentSize,
946 size_t kNumRegistersInGroup,
947 TailProcessing vta,
948 auto vma>
949 void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
950 return OpVectorLoad<ElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
951 args.dst, src, [stride](size_t index) { return stride * index; });
952 }
953
954 template <typename ElementType,
955 size_t kSegmentSize,
956 VectorRegisterGroupMultiplier vlmul,
957 TailProcessing vta,
958 auto vma>
959 void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
960 return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(args,
961 src);
962 }
963
964 template <typename ElementType,
965 size_t kSegmentSize,
966 size_t kNumRegistersInGroup,
967 TailProcessing vta,
968 auto vma>
969 void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
970 switch (args.opcode) {
971 case Decoder::VLUmOpOpcode::kVleXXff:
972 return OpVectorLoad<ElementType,
973 kSegmentSize,
974 kNumRegistersInGroup,
975 vta,
976 vma,
977 Decoder::VLUmOpOpcode::kVleXXff>(
978 args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
979 case Decoder::VLUmOpOpcode::kVleXX:
980 return OpVectorLoad<ElementType,
981 kSegmentSize,
982 kNumRegistersInGroup,
983 vta,
984 vma,
985 Decoder::VLUmOpOpcode::kVleXX>(
986 args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
987 case Decoder::VLUmOpOpcode::kVlm:
988 if constexpr (kSegmentSize == 1 &&
989 std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
990 return OpVectorLoad<UInt8,
991 1,
992 1,
993 TailProcessing::kAgnostic,
994 vma,
995 Decoder::VLUmOpOpcode::kVlm>(
996 args.dst, src, [](size_t index) { return index; });
997 }
998 return Undefined();
999 default:
1000 return Undefined();
1001 }
1002 }
1003
1004 // The strided version of a segmented load sounds like something very convoluted and complicated
1005 // that no one may ever want to use, but it's not rare and may be illustrated with a simple RGB
1006 // bitmap window.
1007 //
1008 // Suppose it's in memory like this (doubles are 8 bytes in size as per IEEE 754):
1009 // {R: 0.01}{G: 0.11}{B: 0.21} {R: 1.01}{G: 1.11}{B: 1.21}, {R: 2.01}{G: 2.11}{B: 2.21}
1010 // {R:10.01}{G:10.11}{B:10.21} {R:11.01}{G:11.11}{B:11.21}, {R:12.01}{G:12.11}{B:12.21}
1011 // {R:20.01}{G:20.11}{B:20.21} {R:21.01}{G:21.11}{B:21.21}, {R:22.01}{G:22.11}{B:22.21}
1012 // {R:30.01}{G:30.11}{B:30.21} {R:31.01}{G:31.11}{B:31.21}, {R:32.01}{G:32.11}{B:32.21}
1013 // This is a very tiny 3x4 image with 3 components: red, green, and blue.
1014 //
1015 // Let's assume that x1 is loaded with the address of the first element and x2 with 72 (that's how much
1016 // one row of this image takes).
1017 //
1018 // Then we may use the following instruction to load values from memory (with LMUL = 2, ELEN = 4):
1019 // vlsseg3e64.v v0, (x1), x2
1020 //
1021 // They would be loaded like this:
1022 // v0: {R: 0.01}{R:10.01} (first group of 2 registers)
1023 // v1: {R:20.01}{R:30.01}
1024 // v2: {G: 0.11}{G:10.11} (second group of 2 registers)
1025 // v3: {G:20.11}{G:30.11}
1026 // v4: {B: 0.21}{B:10.21} (third group of 2 registers)
1027 // v5: {B:20.21}{B:30.21}
1028 // Now we have loaded a column from memory and all three colors are put into different register
1029 // groups for further processing.
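// In scalar terms the example above behaves roughly like the sketch below (kSegmentSize=3
// doubles per segment, stride x2 between segments; the *_group names are purely illustrative):
// for (size_t i = 0; i < vl; ++i) {
// red_group[i] = *reinterpret_cast<const double*>(x1 + i * x2 + 0 * sizeof(double));
// green_group[i] = *reinterpret_cast<const double*>(x1 + i * x2 + 1 * sizeof(double));
// blue_group[i] = *reinterpret_cast<const double*>(x1 + i * x2 + 2 * sizeof(double));
// }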
1030 template <typename ElementType,
1031 size_t kSegmentSize,
1032 size_t kNumRegistersInGroup,
1033 TailProcessing vta,
1034 auto vma,
1035 typename Decoder::VLUmOpOpcode opcode = typename Decoder::VLUmOpOpcode{},
1036 typename GetElementOffsetLambdaType>
1037 void OpVectorLoad(uint8_t dst, Register src, GetElementOffsetLambdaType GetElementOffset) {
1038 using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
1039 if (!IsAligned<kNumRegistersInGroup>(dst)) {
1040 return Undefined();
1041 }
1042 if (dst + kNumRegistersInGroup * kSegmentSize > 32) {
1043 return Undefined();
1044 }
1045 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1046 size_t vstart = GetCsr<CsrName::kVstart>();
1047 size_t vl = GetCsr<CsrName::kVl>();
1048 if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) {
1049 vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
1050 }
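// For mask loads vl counts mask bits, so it's converted to whole bytes here: e.g. vl=17 mask
// bits means 3 bytes have to be read.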
1051 // In case of a memory access fault we may set vstart to a non-zero value; set it to zero here to
1052 // simplify the logic below.
1053 SetCsr<CsrName::kVstart>(0);
1054 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1055 // vector register group, including that no tail elements are updated with agnostic values.
1056 if (vstart >= vl) [[unlikely]] {
1057 return;
1058 }
1059 if constexpr (vta == TailProcessing::kAgnostic) {
1060 vstart = std::min(vstart, vl);
1061 }
1062 // Note: within_group_id is the current register id within a register group. During one
1063 // iteration of this loop we compute results for all registers with the current id in all
1064 // groups. E.g. for the example above we'd compute v0, v2, v4 during the first iteration (id
1065 // within group = 0), and v1, v3, v5 during the second iteration (id within group = 1). This
1066 // ensures that memory is always accessed in ordered fashion.
1067 std::array<SIMD128Register, kSegmentSize> result;
1068 char* ptr = ToHostAddr<char>(src);
1069 auto mask = GetMaskForVectorOperations<vma>();
1070 for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
1071 ++within_group_id) {
1072 // No need to continue if we have kUndisturbed vta strategy.
1073 if constexpr (vta == TailProcessing::kUndisturbed) {
1074 if (within_group_id * kElementsCount >= vl) {
1075 break;
1076 }
1077 }
1078 // If we have elements that won't be overwritten then load these from registers.
1079 // For the interpreter we could have filled all the registers unconditionally, but we'll want to
1080 // reuse this code in JITs later.
1081 auto register_mask =
1082 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
1083 auto full_mask = std::get<0>(intrinsics::FullMaskForRegister<ElementType>(mask));
1084 if (vstart ||
1085 (vl < (within_group_id + 1) * kElementsCount && vta == TailProcessing::kUndisturbed) ||
1086 !(std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing> ||
1087 static_cast<InactiveProcessing>(vma) != InactiveProcessing::kUndisturbed ||
1088 register_mask == full_mask)) {
1089 for (size_t field = 0; field < kSegmentSize; ++field) {
1090 result[field].Set(state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup]);
1091 }
1092 }
1093 // Read elements from memory, but only if there are any active ones.
1094 for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
1095 ++within_register_id) {
1096 size_t element_index = kElementsCount * within_group_id + within_register_id;
1097 // Stop if we reached the vl limit.
1098 if (vl <= element_index) {
1099 break;
1100 }
1101 // Don't touch masked-out elements.
1102 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1103 if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
1104 1 << within_register_id)}) == MaskType{0}) {
1105 continue;
1106 }
1107 }
1108 // Load segment from memory.
1109 for (size_t field = 0; field < kSegmentSize; ++field) {
1110 FaultyLoadResult mem_access_result =
1111 FaultyLoad(ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
1112 sizeof(ElementType));
1113 if (mem_access_result.is_fault) {
1114 // Documentation doesn't tell us what we are supposed to do to the remaining elements when an
1115 // access fault happens, but let's trigger an exception and treat the remaining elements
1116 // using the vta-specified strategy by simply adjusting vl.
1117 vl = element_index;
1118 if constexpr (opcode == Decoder::VLUmOpOpcode::kVleXXff) {
1119 // Fail-first load only triggers exceptions for the first element, otherwise it
1120 // changes vl to ensure that other operations would only process elements that are
1121 // successfully loaded.
1122 if (element_index == 0) [[unlikely]] {
1123 exception_raised_ = true;
1124 } else {
1125 // TODO(b/323994286): Write a test case to verify vl changes correctly.
1126 SetCsr<CsrName::kVl>(element_index);
1127 }
1128 } else {
1129 // Most load instructions set vstart to the failing element, which then may be processed
1130 // by the exception handler.
1131 exception_raised_ = true;
1132 SetCsr<CsrName::kVstart>(element_index);
1133 }
1134 break;
1135 }
1136 result[field].template Set<ElementType>(static_cast<ElementType>(mem_access_result.value),
1137 within_register_id);
1138 }
1139 }
1140 // Lambda to generate the tail mask. We don't want to call MakeBitmaskFromVl eagerly because it's
1141 // not needed most of the time, and the compiler couldn't eliminate the access to mmap-backed memory.
1142 auto GetTailMask = [vl, within_group_id] {
1143 return std::get<0>(intrinsics::MakeBitmaskFromVl<ElementType>(
1144 (vl <= within_group_id * kElementsCount) ? 0 : vl - within_group_id * kElementsCount));
1145 };
1146 // If mask has inactive elements and InactiveProcessing::kAgnostic mode is used then set them
1147 // to ~0.
1148 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1149 if (register_mask != full_mask) {
1150 auto [simd_mask] =
1151 intrinsics::BitMaskToSimdMask<ElementType>(Int64{MaskType{register_mask}});
1152 for (size_t field = 0; field < kSegmentSize; ++field) {
1153 if constexpr (vma == InactiveProcessing::kAgnostic) {
1154 // A non-zero vstart is supposed to be exceptional. From the RISC-V V manual (page 14):
1155 // The vstart CSR is writable by unprivileged code, but non-zero vstart values may
1156 // cause vector instructions to run substantially slower on some implementations, so
1157 // vstart should not be used by application programmers. A few vector instructions
1158 // cannot be executed with a non-zero vstart value and will raise an illegal
1159 // instruction exception as defined below.
1160 // TODO(b/300690740): decide whether to merge two cases after support for vectors in
1161 // heavy optimizer would be implemented.
1162 if (vstart) [[unlikely]] {
1163 SIMD128Register vstart_mask = std::get<0>(
1164 intrinsics::MakeBitmaskFromVl<ElementType>(vstart % kElementsCount));
1165 if constexpr (vta == TailProcessing::kAgnostic) {
1166 result[field] |= vstart_mask & ~simd_mask;
1167 } else if (vl < (within_group_id + 1) * kElementsCount) {
1168 result[field] |= vstart_mask & ~simd_mask & ~GetTailMask();
1169 } else {
1170 result[field] |= vstart_mask & ~simd_mask;
1171 }
1172 } else if constexpr (vta == TailProcessing::kAgnostic) {
1173 result[field] |= ~simd_mask;
1174 } else {
1175 if (vl < (within_group_id + 1) * kElementsCount) {
1176 result[field] |= ~simd_mask & ~GetTailMask();
1177 } else {
1178 result[field] |= ~simd_mask;
1179 }
1180 }
1181 }
1182 }
1183 }
1184 }
1185 // If we have tail elements and TailProcessing::kAgnostic mode then set them to ~0.
1186 if constexpr (vta == TailProcessing::kAgnostic) {
1187 for (size_t field = 0; field < kSegmentSize; ++field) {
1188 if (vl < (within_group_id + 1) * kElementsCount) {
1189 result[field] |= GetTailMask();
1190 }
1191 }
1192 }
1193 // Put values back into register file.
1194 for (size_t field = 0; field < kSegmentSize; ++field) {
1195 state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup] =
1196 result[field].template Get<__uint128_t>();
1197 }
1198 // Next group should be fully processed.
1199 vstart = 0;
1200 }
1201 }
1202
1203 // The vector register gather instructions read elements from the src1 vector register group at
1204 // locations given by the second source vector register group, src2.
1205 // src1: element vector register.
1206 // GetElementIndex: universal lambda that returns the index from src2.
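// In pseudo-code terms, the per-element behaviour implemented below is roughly (masking aside):
// dst[i] = (GetElementIndex(i) < VLMAX) ? src1_group[GetElementIndex(i)] : 0.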
1207 template <typename ElementType,
1208 VectorRegisterGroupMultiplier vlmul,
1209 TailProcessing vta,
1210 auto vma,
1211 typename GetElementIndexLambdaType>
1212 void OpVectorGather(uint8_t dst, uint8_t src1, GetElementIndexLambdaType GetElementIndex) {
1213 constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
1214 if (!IsAligned<kRegistersInvolved>(dst | src1)) {
1215 return Undefined();
1216 }
1217 // Source and destination must not overlap.
1218 if (dst < (src1 + kRegistersInvolved) && src1 < (dst + kRegistersInvolved)) {
1219 return Undefined();
1220 }
1221 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1222 constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
1223
1224 size_t vstart = GetCsr<CsrName::kVstart>();
1225 size_t vl = GetCsr<CsrName::kVl>();
1226 auto mask = GetMaskForVectorOperations<vma>();
1227 SetCsr<CsrName::kVstart>(0);
1228 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1229 // vector register group, including that no tail elements are updated with agnostic values.
1230 if (vstart >= vl) [[unlikely]] {
1231 return;
1232 }
1233
1234 // Copy vlmul registers into array of elements, access elements of temporary array.
1235 alignas(alignof(SIMD128Register)) ElementType values[vlmax];
1236 memcpy(values, state_->cpu.v + src1, sizeof(values));
1237 // Fill dst first, resolve mask later.
1238 for (size_t index = vstart / kElementsCount; index < kRegistersInvolved; ++index) {
1239 SIMD128Register original_dst_value;
1240 SIMD128Register result{state_->cpu.v[dst + index]};
1241 for (size_t dst_element_index = vstart % kElementsCount; dst_element_index < kElementsCount;
1242 ++dst_element_index) {
1243 size_t src_element_index = GetElementIndex(index * kElementsCount + dst_element_index);
1244
1245 // If an element index is out of range ( vs1[i] >= VLMAX ) then zero is returned for the
1246 // element value.
1247 ElementType element_value = ElementType{0};
1248 if (src_element_index < vlmax) {
1249 element_value = values[src_element_index];
1250 }
1251 original_dst_value.Set<ElementType>(element_value, dst_element_index);
1252 }
1253
1254 // Apply mask and put result values into dst register.
1255 result =
1256 VectorMasking<ElementType, vta, vma>(result, original_dst_value, vstart, vl, index, mask);
1257 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
1258 // Next group should be fully processed.
1259 vstart = 0;
1260 }
1261 }
1262
1263 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
1264 void OpVector(const Decoder::VOpFVfArgs& args, ElementType arg2) {
1265 using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1266 if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1267 // Keep cases sorted in opcode order to match RISC-V V manual.
1268 switch (args.opcode) {
1269 case Decoder::VOpFVfOpcode::kVfwaddvf:
1270 return OpVectorWidenvx<intrinsics::Vfwaddvf<ElementType>,
1271 ElementType,
1272 vlmul,
1273 vta,
1274 vma,
1275 kFrm>(args.dst, args.src1, arg2);
1276 case Decoder::VOpFVfOpcode::kVfwsubvf:
1277 return OpVectorWidenvx<intrinsics::Vfwsubvf<ElementType>,
1278 ElementType,
1279 vlmul,
1280 vta,
1281 vma,
1282 kFrm>(args.dst, args.src1, arg2);
1283 case Decoder::VOpFVfOpcode::kVfwmulvf:
1284 return OpVectorWidenvx<intrinsics::Vfwmulvf<ElementType>,
1285 ElementType,
1286 vlmul,
1287 vta,
1288 vma,
1289 kFrm>(args.dst, args.src1, arg2);
1290 case Decoder::VOpFVfOpcode::kVfwaddwf:
1291 return OpVectorWidenwx<intrinsics::Vfwaddwf<ElementType>,
1292 ElementType,
1293 vlmul,
1294 vta,
1295 vma,
1296 kFrm>(args.dst, args.src1, arg2);
1297 case Decoder::VOpFVfOpcode::kVfwsubwf:
1298 return OpVectorWidenwx<intrinsics::Vfwsubwf<ElementType>,
1299 ElementType,
1300 vlmul,
1301 vta,
1302 vma,
1303 kFrm>(args.dst, args.src1, arg2);
1304 case Decoder::VOpFVfOpcode::kVfwmaccvf:
1305 return OpVectorWidenvxw<intrinsics::Vfwmaccvf<ElementType>,
1306 ElementType,
1307 vlmul,
1308 vta,
1309 vma,
1310 kFrm>(args.dst, args.src1, arg2);
1311 case Decoder::VOpFVfOpcode::kVfwnmaccvf:
1312 return OpVectorWidenvxw<intrinsics::Vfwnmaccvf<ElementType>,
1313 ElementType,
1314 vlmul,
1315 vta,
1316 vma,
1317 kFrm>(args.dst, args.src1, arg2);
1318 case Decoder::VOpFVfOpcode::kVfwmsacvf:
1319 return OpVectorWidenvxw<intrinsics::Vfwmsacvf<ElementType>,
1320 ElementType,
1321 vlmul,
1322 vta,
1323 vma,
1324 kFrm>(args.dst, args.src1, arg2);
1325 case Decoder::VOpFVfOpcode::kVfwnmsacvf:
1326 return OpVectorWidenvxw<intrinsics::Vfwnmsacvf<ElementType>,
1327 ElementType,
1328 vlmul,
1329 vta,
1330 vma,
1331 kFrm>(args.dst, args.src1, arg2);
1332 default:
1333 break;
1334 }
1335 }
1336 // Keep cases sorted in opcode order to match RISC-V V manual.
1337 switch (args.opcode) {
1338 case Decoder::VOpFVfOpcode::kVfminvf:
1339 return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
1340 args.dst, args.src1, arg2);
1341 case Decoder::VOpFVfOpcode::kVfmaxvf:
1342 return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
1343 args.dst, args.src1, arg2);
1344 case Decoder::VOpFVfOpcode::kVfsgnjvf:
1345 return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
1346 args.dst, args.src1, arg2);
1347 case Decoder::VOpFVfOpcode::kVfsgnjnvf:
1348 return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
1349 args.dst, args.src1, arg2);
1350 case Decoder::VOpFVfOpcode::kVfsgnjxvf:
1351 return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
1352 args.dst, args.src1, arg2);
1353 case Decoder::VOpFVfOpcode::kVfslide1upvf:
1354 return OpVectorslide1up<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1355 case Decoder::VOpFVfOpcode::kVfslide1downvf:
1356 return OpVectorslide1down<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1357 case Decoder::VOpFVfOpcode::kVfmvsf:
1358 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1359 return Undefined();
1360 }
1361 if (args.src1 != 0) {
1362 return Undefined();
1363 }
1364 return OpVectorVmvsx<ElementType, vta>(args.dst, arg2);
1365 case Decoder::VOpFVfOpcode::kVfmergevf:
1366 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1367 if (args.src1 != 0) {
1368 return Undefined();
1369 }
1370 return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(args.dst,
1371 arg2);
1372 } else {
1373 return OpVectorx<intrinsics::Vcopyx<ElementType>,
1374 ElementType,
1375 vlmul,
1376 vta,
1377 // Always use "undisturbed" value from source register.
1378 InactiveProcessing::kUndisturbed>(
1379 args.dst, arg2, /*dst_mask=*/args.src1);
1380 }
1381 case Decoder::VOpFVfOpcode::kVmfeqvf:
1382 return OpVectorToMaskvx<intrinsics::Vfeqvx<ElementType>, ElementType, vlmul, vma>(
1383 args.dst, args.src1, arg2);
1384 case Decoder::VOpFVfOpcode::kVmflevf:
1385 return OpVectorToMaskvx<intrinsics::Vflevx<ElementType>, ElementType, vlmul, vma>(
1386 args.dst, args.src1, arg2);
1387 case Decoder::VOpFVfOpcode::kVmfltvf:
1388 return OpVectorToMaskvx<intrinsics::Vfltvx<ElementType>, ElementType, vlmul, vma>(
1389 args.dst, args.src1, arg2);
1390 case Decoder::VOpFVfOpcode::kVmfnevf:
1391 return OpVectorToMaskvx<intrinsics::Vfnevx<ElementType>, ElementType, vlmul, vma>(
1392 args.dst, args.src1, arg2);
1393 case Decoder::VOpFVfOpcode::kVmfgtvf:
1394 return OpVectorToMaskvx<intrinsics::Vfgtvx<ElementType>, ElementType, vlmul, vma>(
1395 args.dst, args.src1, arg2);
1396 case Decoder::VOpFVfOpcode::kVmfgevf:
1397 return OpVectorToMaskvx<intrinsics::Vfgevx<ElementType>, ElementType, vlmul, vma>(
1398 args.dst, args.src1, arg2);
1399 case Decoder::VOpFVfOpcode::kVfdivvf:
1400 return OpVectorSameWidth<intrinsics::Vfdivvf<ElementType>,
1401 ElementType,
1402 NumberOfRegistersInvolved(vlmul),
1403 vta,
1404 vma,
1405 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1406 case Decoder::VOpFVfOpcode::kVfrdivvf:
1407 return OpVectorSameWidth<intrinsics::Vfrdivvf<ElementType>,
1408 ElementType,
1409 NumberOfRegistersInvolved(vlmul),
1410 vta,
1411 vma,
1412 kFrm>(
1413 args.dst,
1414 Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
1415 : 0x3ff0'0000'0000'0000}>{
1416 args.src1},
1417 arg2);
1418 case Decoder::VOpFVfOpcode::kVfmulvf:
1419 return OpVectorSameWidth<intrinsics::Vfmulvf<ElementType>,
1420 ElementType,
1421 NumberOfRegistersInvolved(vlmul),
1422 vta,
1423 vma,
1424 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1425 case Decoder::VOpFVfOpcode::kVfaddvf:
1426 return OpVectorSameWidth<intrinsics::Vfaddvf<ElementType>,
1427 ElementType,
1428 NumberOfRegistersInvolved(vlmul),
1429 vta,
1430 vma,
1431 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1432 case Decoder::VOpFVfOpcode::kVfsubvf:
1433 return OpVectorSameWidth<intrinsics::Vfsubvf<ElementType>,
1434 ElementType,
1435 NumberOfRegistersInvolved(vlmul),
1436 vta,
1437 vma,
1438 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1439 case Decoder::VOpFVfOpcode::kVfrsubvf:
1440 return OpVectorSameWidth<intrinsics::Vfrsubvf<ElementType>,
1441 ElementType,
1442 NumberOfRegistersInvolved(vlmul),
1443 vta,
1444 vma,
1445 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1446 case Decoder::VOpFVfOpcode::kVfmaccvf:
1447 return OpVectorvxv<intrinsics::Vfmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1448 args.dst, args.src1, arg2);
1449 case Decoder::VOpFVfOpcode::kVfmsacvf:
1450 return OpVectorvxv<intrinsics::Vfmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1451 args.dst, args.src1, arg2);
1452 case Decoder::VOpFVfOpcode::kVfmaddvf:
1453 return OpVectorvxv<intrinsics::Vfmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1454 args.dst, args.src1, arg2);
1455 case Decoder::VOpFVfOpcode::kVfmsubvf:
1456 return OpVectorvxv<intrinsics::Vfmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1457 args.dst, args.src1, arg2);
1458 case Decoder::VOpFVfOpcode::kVfnmaccvf:
1459 return OpVectorvxv<intrinsics::Vfnmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1460 args.dst, args.src1, arg2);
1461 case Decoder::VOpFVfOpcode::kVfnmsacvf:
1462 return OpVectorvxv<intrinsics::Vfnmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1463 args.dst, args.src1, arg2);
1464 case Decoder::VOpFVfOpcode::kVfnmaddvf:
1465 return OpVectorvxv<intrinsics::Vfnmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1466 args.dst, args.src1, arg2);
1467 case Decoder::VOpFVfOpcode::kVfnmsubvf:
1468 return OpVectorvxv<intrinsics::Vfnmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1469 args.dst, args.src1, arg2);
1470 default:
1471 return Undefined();
1472 }
1473 }
1474
1475 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
1476   void OpVector(const Decoder::VOpFVvArgs& args) {
1477 using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1478 using UnsignedType = Wrapping<std::make_unsigned_t<typename TypeTraits<ElementType>::Int>>;
1479 // We currently don't support Float16 operations, but conversion routines that deal with
1480 // double-width floats use these encodings to produce regular Float32 types.
1481 if constexpr (sizeof(ElementType) <= sizeof(Float32)) {
1482 using WideElementType = typename TypeTraits<ElementType>::Wide;
1483 // Keep cases sorted in opcode order to match RISC-V V manual.
1484 switch (args.opcode) {
1485 case Decoder::VOpFVvOpcode::kVFUnary0:
1486 switch (args.vfunary0_opcode) {
1487 case Decoder::VFUnary0Opcode::kVfwcvtfxuv:
1488 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1489 return intrinsics::Vfcvtv<WideElementType, UnsignedType>(FPFlags::DYN, frm, src);
1490 },
1491 UnsignedType,
1492 vlmul,
1493 vta,
1494 vma,
1495 kFrm>(args.dst, args.src1);
1496 case Decoder::VFUnary0Opcode::kVfwcvtfxv:
1497 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1498 return intrinsics::Vfcvtv<WideElementType, SignedType>(FPFlags::DYN, frm, src);
1499 },
1500 SignedType,
1501 vlmul,
1502 vta,
1503 vma,
1504 kFrm>(args.dst, args.src1);
1505 case Decoder::VFUnary0Opcode::kVfncvtxufw:
1506 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1507 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::DYN, frm, src);
1508 },
1509 UnsignedType,
1510 vlmul,
1511 vta,
1512 vma,
1513 kFrm>(args.dst, args.src1);
1514 case Decoder::VFUnary0Opcode::kVfncvtxfw:
1515 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1516 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::DYN, frm, src);
1517 },
1518 SignedType,
1519 vlmul,
1520 vta,
1521 vma,
1522 kFrm>(args.dst, args.src1);
1523 case Decoder::VFUnary0Opcode::kVfncvtrtzxufw:
1524 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1525 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::RTZ, frm, src);
1526 },
1527 UnsignedType,
1528 vlmul,
1529 vta,
1530 vma,
1531 kFrm>(args.dst, args.src1);
1532 case Decoder::VFUnary0Opcode::kVfncvtrtzxfw:
1533 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1534 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::RTZ, frm, src);
1535 },
1536 SignedType,
1537 vlmul,
1538 vta,
1539 vma,
1540 kFrm>(args.dst, args.src1);
1541 default:
1542 break; // Make compiler happy.
1543 }
1544 break;
1545 default:
1546 break; // Make compiler happy.
1547 }
1548 }
1549     // Widening and narrowing operations which take a floating point “narrow” operand may only
1550     // work correctly with Float32 input: Float16 is not supported yet, while Float64 input
1551     // would produce a 128-bit output, which is currently reserved in RISC-V V.
1552 if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1553 using WideElementType = WideType<ElementType>;
1554 using WideSignedType = WideType<SignedType>;
1555 using WideUnsignedType = WideType<UnsignedType>;
1556 // Keep cases sorted in opcode order to match RISC-V V manual.
1557 switch (args.opcode) {
1558 case Decoder::VOpFVvOpcode::kVfwaddvv:
1559 return OpVectorWidenvv<intrinsics::Vfwaddvv<ElementType>,
1560 ElementType,
1561 vlmul,
1562 vta,
1563 vma,
1564 kFrm>(args.dst, args.src1, args.src2);
1565 case Decoder::VOpFVvOpcode::kVfwsubvv:
1566 return OpVectorWidenvv<intrinsics::Vfwsubvv<ElementType>,
1567 ElementType,
1568 vlmul,
1569 vta,
1570 vma,
1571 kFrm>(args.dst, args.src1, args.src2);
1572 case Decoder::VOpFVvOpcode::kVfwmulvv:
1573 return OpVectorWidenvv<intrinsics::Vfwmulvv<ElementType>,
1574 ElementType,
1575 vlmul,
1576 vta,
1577 vma,
1578 kFrm>(args.dst, args.src1, args.src2);
1579 case Decoder::VOpFVvOpcode::kVfwaddwv:
1580 return OpVectorWidenwv<intrinsics::Vfwaddwv<ElementType>,
1581 ElementType,
1582 vlmul,
1583 vta,
1584 vma,
1585 kFrm>(args.dst, args.src1, args.src2);
1586 case Decoder::VOpFVvOpcode::kVfwsubwv:
1587 return OpVectorWidenwv<intrinsics::Vfwsubwv<ElementType>,
1588 ElementType,
1589 vlmul,
1590 vta,
1591 vma,
1592 kFrm>(args.dst, args.src1, args.src2);
1593 case Decoder::VOpFVvOpcode::kVfwmaccvv:
1594 return OpVectorWidenvvw<intrinsics::Vfwmaccvv<ElementType>,
1595 ElementType,
1596 vlmul,
1597 vta,
1598 vma,
1599 kFrm>(args.dst, args.src1, args.src2);
1600 case Decoder::VOpFVvOpcode::kVfwnmaccvv:
1601 return OpVectorWidenvvw<intrinsics::Vfwnmaccvv<ElementType>,
1602 ElementType,
1603 vlmul,
1604 vta,
1605 vma,
1606 kFrm>(args.dst, args.src1, args.src2);
1607 case Decoder::VOpFVvOpcode::kVfwmsacvv:
1608 return OpVectorWidenvvw<intrinsics::Vfwmsacvv<ElementType>,
1609 ElementType,
1610 vlmul,
1611 vta,
1612 vma,
1613 kFrm>(args.dst, args.src1, args.src2);
1614 case Decoder::VOpFVvOpcode::kVfwnmsacvv:
1615 return OpVectorWidenvvw<intrinsics::Vfwnmsacvv<ElementType>,
1616 ElementType,
1617 vlmul,
1618 vta,
1619 vma,
1620 kFrm>(args.dst, args.src1, args.src2);
1621 case Decoder::VOpFVvOpcode::kVFUnary0:
1622 switch (args.vfunary0_opcode) {
1623 case Decoder::VFUnary0Opcode::kVfwcvtxufv:
1624 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1625 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::DYN, frm, src);
1626 },
1627 ElementType,
1628 vlmul,
1629 vta,
1630 vma,
1631 kFrm>(args.dst, args.src1);
1632 case Decoder::VFUnary0Opcode::kVfwcvtxfv:
1633 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1634 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::DYN, frm, src);
1635 },
1636 ElementType,
1637 vlmul,
1638 vta,
1639 vma,
1640 kFrm>(args.dst, args.src1);
1641 case Decoder::VFUnary0Opcode::kVfwcvtffv:
1642 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1643 return intrinsics::Vfcvtv<WideElementType, ElementType>(FPFlags::DYN, frm, src);
1644 },
1645 ElementType,
1646 vlmul,
1647 vta,
1648 vma,
1649 kFrm>(args.dst, args.src1);
1650 case Decoder::VFUnary0Opcode::kVfwcvtrtzxufv:
1651 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1652 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1653 },
1654 ElementType,
1655 vlmul,
1656 vta,
1657 vma,
1658 kFrm>(args.dst, args.src1);
1659 case Decoder::VFUnary0Opcode::kVfwcvtrtzxfv:
1660 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1661 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::RTZ, frm, src);
1662 },
1663 ElementType,
1664 vlmul,
1665 vta,
1666 vma,
1667 kFrm>(args.dst, args.src1);
1668 case Decoder::VFUnary0Opcode::kVfncvtfxuw:
1669 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1670 return intrinsics::Vfcvtv<ElementType, WideUnsignedType>(FPFlags::DYN, frm, src);
1671 },
1672 ElementType,
1673 vlmul,
1674 vta,
1675 vma,
1676 kFrm>(args.dst, args.src1);
1677 case Decoder::VFUnary0Opcode::kVfncvtffw:
1678 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1679 return intrinsics::Vfcvtv<ElementType, WideElementType>(FPFlags::DYN, frm, src);
1680 },
1681 ElementType,
1682 vlmul,
1683 vta,
1684 vma,
1685 kFrm>(args.dst, args.src1);
1686 case Decoder::VFUnary0Opcode::kVfncvtfxw:
1687 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1688 return intrinsics::Vfcvtv<ElementType, WideSignedType>(FPFlags::DYN, frm, src);
1689 },
1690 ElementType,
1691 vlmul,
1692 vta,
1693 vma,
1694 kFrm>(args.dst, args.src1);
1695 default:
1696 break; // Make compiler happy.
1697 }
1698 break;
1699 default:
1700 break; // Make compiler happy.
1701 }
1702 }
1703     // If our ElementType is Float16 then “straight” operations are unsupported and we shouldn't
1704     // try to instantiate any functions since this would lead to a compile-time error.
1705 if constexpr (sizeof(ElementType) >= sizeof(Float32)) {
1706       // The floating point IEEE 754 value -0.0 has the top (sign) bit set and all other bits
1707       // clear: https://en.wikipedia.org/wiki/Signed_zero#Representations This is exactly the
1708       // same representation the minimum negative integer has in two's complement:
1709       // https://en.wikipedia.org/wiki/Two%27s_complement#Most_negative_number
1710       // Note: we pass filler elements as integers because `Float32`/`Float64` cannot be
1711       // template parameters.
1712 constexpr SignedType kNegativeZero{std::numeric_limits<typename SignedType::BaseType>::min()};
1713       // The floating point IEEE 754 value +0.0 consists of all zero bits, the same as integer zero.
1714 constexpr SignedType kPositiveZero{};
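      // For example, for Float32: -0.0 and INT32_MIN are both 0x8000'0000, while +0.0 is
      // 0x0000'0000, matching integer zero.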
1715 // Keep cases sorted in opcode order to match RISC-V V manual.
1716 switch (args.opcode) {
1717 case Decoder::VOpFVvOpcode::kVfredusumvs:
1718 // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1719 // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
1720 if (GetCsr<kFrm>() != FPFlags::RDN) {
1721 return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1722 ElementType,
1723 vlmul,
1724 vta,
1725 vma,
1726 kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1727 } else {
1728 return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1729 ElementType,
1730 vlmul,
1731 vta,
1732 vma,
1733 kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1734 }
1735 case Decoder::VOpFVvOpcode::kVfredosumvs:
1736 // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1737 // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
1738 if (GetCsr<kFrm>() != FPFlags::RDN) {
1739 return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1740 ElementType,
1741 vlmul,
1742 vta,
1743 vma,
1744 kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1745 } else {
1746 return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1747 ElementType,
1748 vlmul,
1749 vta,
1750 vma,
1751 kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1752 }
1753 case Decoder::VOpFVvOpcode::kVfminvv:
1754 return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>(
1755 args.dst, args.src1, args.src2);
1756 case Decoder::VOpFVvOpcode::kVfredminvs:
1757 // For Vfredmin the identity element is +inf.
1758 return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>(
1759 args.dst,
1760 Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000
1761 : 0x7ff0'0000'0000'0000}>{
1762 args.src1},
1763 args.src2);
1764 case Decoder::VOpFVvOpcode::kVfmaxvv:
1765 return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
1766 args.dst, args.src1, args.src2);
1767 case Decoder::VOpFVvOpcode::kVfredmaxvs:
1768 // For Vfredmax the identity element is -inf.
1769 return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>(
1770 args.dst,
1771 Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000
1772 : 0xfff0'0000'0000'0000}>{
1773 args.src1},
1774 args.src2);
1775 case Decoder::VOpFVvOpcode::kVfsgnjvv:
1776 return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>(
1777 args.dst, args.src1, args.src2);
1778 case Decoder::VOpFVvOpcode::kVfsgnjnvv:
1779 return OpVectorvv<intrinsics::Vfsgnjnvv<ElementType>, ElementType, vlmul, vta, vma>(
1780 args.dst, args.src1, args.src2);
1781 case Decoder::VOpFVvOpcode::kVfsgnjxvv:
1782 return OpVectorvv<intrinsics::Vfsgnjxvv<ElementType>, ElementType, vlmul, vta, vma>(
1783 args.dst, args.src1, args.src2);
1784 case Decoder::VOpFVvOpcode::kVFUnary0:
1785 switch (args.vfunary0_opcode) {
1786 case Decoder::VFUnary0Opcode::kVfcvtxufv:
1787 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1788 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::DYN, frm, src);
1789 },
1790 ElementType,
1791 vlmul,
1792 vta,
1793 vma,
1794 kFrm>(args.dst, args.src1);
1795 case Decoder::VFUnary0Opcode::kVfcvtxfv:
1796 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1797 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::DYN, frm, src);
1798 },
1799 ElementType,
1800 vlmul,
1801 vta,
1802 vma,
1803 kFrm>(args.dst, args.src1);
1804 case Decoder::VFUnary0Opcode::kVfcvtfxuv:
1805 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1806 return intrinsics::Vfcvtv<ElementType, UnsignedType>(FPFlags::DYN, frm, src);
1807 },
1808 UnsignedType,
1809 vlmul,
1810 vta,
1811 vma,
1812 kFrm>(args.dst, args.src1);
1813 case Decoder::VFUnary0Opcode::kVfcvtfxv:
1814 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1815 return intrinsics::Vfcvtv<ElementType, SignedType>(FPFlags::DYN, frm, src);
1816 },
1817 SignedType,
1818 vlmul,
1819 vta,
1820 vma,
1821 kFrm>(args.dst, args.src1);
1822 case Decoder::VFUnary0Opcode::kVfcvtrtzxufv:
1823 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1824 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1825 },
1826 ElementType,
1827 vlmul,
1828 vta,
1829 vma,
1830 kFrm>(args.dst, args.src1);
1831 case Decoder::VFUnary0Opcode::kVfcvtrtzxfv:
1832 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1833 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::RTZ, frm, src);
1834 },
1835 ElementType,
1836 vlmul,
1837 vta,
1838 vma,
1839 kFrm>(args.dst, args.src1);
1840 default:
1841 break; // Make compiler happy.
1842 }
1843 break;
1844 case Decoder::VOpFVvOpcode::kVFUnary1:
1845 switch (args.vfunary1_opcode) {
1846 case Decoder::VFUnary1Opcode::kVfsqrtv:
1847 return OpVectorv<intrinsics::Vfsqrtv<ElementType>,
1848 ElementType,
1849 vlmul,
1850 vta,
1851 vma,
1852 kFrm>(args.dst, args.src1);
1854 case Decoder::VFUnary1Opcode::kVfrsqrt7v:
1855 return OpVectorv<intrinsics::Vfrsqrt7v<ElementType>, ElementType, vlmul, vta, vma>(
1856 args.dst, args.src1);
1858 case Decoder::VFUnary1Opcode::kVfclassv:
1859 return OpVectorv<intrinsics::Vfclassv<ElementType>, ElementType, vlmul, vta, vma>(
1860 args.dst, args.src1);
1862 default:
1863 break; // Make compiler happy.
1864 }
1865 break;
1866 case Decoder::VOpFVvOpcode::kVfmvfs:
1867 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1868 return Undefined();
1869 }
1870 if (args.src2 != 0) {
1871 return Undefined();
1872 }
1873 return OpVectorVmvfs<ElementType>(args.dst, args.src1);
1874 case Decoder::VOpFVvOpcode::kVmfeqvv:
1875 return OpVectorToMaskvv<intrinsics::Vfeqvv<ElementType>, ElementType, vlmul, vma>(
1876 args.dst, args.src1, args.src2);
1877 case Decoder::VOpFVvOpcode::kVmflevv:
1878 return OpVectorToMaskvv<intrinsics::Vflevv<ElementType>, ElementType, vlmul, vma>(
1879 args.dst, args.src1, args.src2);
1880 case Decoder::VOpFVvOpcode::kVmfltvv:
1881 return OpVectorToMaskvv<intrinsics::Vfltvv<ElementType>, ElementType, vlmul, vma>(
1882 args.dst, args.src1, args.src2);
1883 case Decoder::VOpFVvOpcode::kVmfnevv:
1884 return OpVectorToMaskvv<intrinsics::Vfnevv<ElementType>, ElementType, vlmul, vma>(
1885 args.dst, args.src1, args.src2);
1886 case Decoder::VOpFVvOpcode::kVfdivvv:
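          // Note: the divisor (src2) filler below is the IEEE 754 bit pattern of 1.0
          // (0x3f80'0000 for Float32, 0x3ff0'0000'0000'0000 for Float64).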
1887 return OpVectorSameWidth<intrinsics::Vfdivvv<ElementType>,
1888 ElementType,
1889 NumberOfRegistersInvolved(vlmul),
1890 vta,
1891 vma,
1892 kFrm>(
1893 args.dst,
1894 Vec<SignedType{}>{args.src1},
1895 Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
1896 : 0x3ff0'0000'0000'0000}>{
1897 args.src2});
1898 case Decoder::VOpFVvOpcode::kVfmulvv:
1899 return OpVectorSameWidth<intrinsics::Vfmulvv<ElementType>,
1900 ElementType,
1901 NumberOfRegistersInvolved(vlmul),
1902 vta,
1903 vma,
1904 kFrm>(
1905 args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
1906 case Decoder::VOpFVvOpcode::kVfaddvv:
1907 return OpVectorSameWidth<intrinsics::Vfaddvv<ElementType>,
1908 ElementType,
1909 NumberOfRegistersInvolved(vlmul),
1910 vta,
1911 vma,
1912 kFrm>(
1913 args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
1914 case Decoder::VOpFVvOpcode::kVfsubvv:
1915 return OpVectorSameWidth<intrinsics::Vfsubvv<ElementType>,
1916 ElementType,
1917 NumberOfRegistersInvolved(vlmul),
1918 vta,
1919 vma,
1920 kFrm>(
1921 args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
1922 case Decoder::VOpFVvOpcode::kVfmaccvv:
1923 return OpVectorvvv<intrinsics::Vfmaccvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1924 args.dst, args.src1, args.src2);
1925 case Decoder::VOpFVvOpcode::kVfmsacvv:
1926 return OpVectorvvv<intrinsics::Vfmsacvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1927 args.dst, args.src1, args.src2);
1928 case Decoder::VOpFVvOpcode::kVfmaddvv:
1929 return OpVectorvvv<intrinsics::Vfmaddvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1930 args.dst, args.src1, args.src2);
1931 case Decoder::VOpFVvOpcode::kVfmsubvv:
1932 return OpVectorvvv<intrinsics::Vfmsubvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1933 args.dst, args.src1, args.src2);
1934 case Decoder::VOpFVvOpcode::kVfnmaccvv:
1935 return OpVectorvvv<intrinsics::Vfnmaccvv<ElementType>,
1936 ElementType,
1937 vlmul,
1938 vta,
1939 vma,
1940 kFrm>(args.dst, args.src1, args.src2);
1941 case Decoder::VOpFVvOpcode::kVfnmsacvv:
1942 return OpVectorvvv<intrinsics::Vfnmsacvv<ElementType>,
1943 ElementType,
1944 vlmul,
1945 vta,
1946 vma,
1947 kFrm>(args.dst, args.src1, args.src2);
1948 case Decoder::VOpFVvOpcode::kVfnmaddvv:
1949 return OpVectorvvv<intrinsics::Vfnmaddvv<ElementType>,
1950 ElementType,
1951 vlmul,
1952 vta,
1953 vma,
1954 kFrm>(args.dst, args.src1, args.src2);
1955 case Decoder::VOpFVvOpcode::kVfnmsubvv:
1956 return OpVectorvvv<intrinsics::Vfnmsubvv<ElementType>,
1957 ElementType,
1958 vlmul,
1959 vta,
1960 vma,
1961 kFrm>(args.dst, args.src1, args.src2);
1962 default:
1963 break; // Make compiler happy.
1964 }
1965 }
1966 return Undefined();
1967 }
1968
1969 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
1970   void OpVector(const Decoder::VOpIViArgs& args) {
1971 using SignedType = berberis::SignedType<ElementType>;
1972 using UnsignedType = berberis::UnsignedType<ElementType>;
1973 using SaturatingSignedType = SaturatingType<SignedType>;
1974 using SaturatingUnsignedType = SaturatingType<UnsignedType>;
1975 // Keep cases sorted in opcode order to match RISC-V V manual.
1976 switch (args.opcode) {
1977 case Decoder::VOpIViOpcode::kVaddvi:
1978 return OpVectorvx<intrinsics::Vaddvx<SignedType>, SignedType, vlmul, vta, vma>(
1979 args.dst, args.src, SignedType{args.imm});
1980 case Decoder::VOpIViOpcode::kVrsubvi:
1981 return OpVectorvx<intrinsics::Vrsubvx<SignedType>, SignedType, vlmul, vta, vma>(
1982 args.dst, args.src, SignedType{args.imm});
1983 case Decoder::VOpIViOpcode::kVandvi:
1984 return OpVectorvx<intrinsics::Vandvx<SignedType>, SignedType, vlmul, vta, vma>(
1985 args.dst, args.src, SignedType{args.imm});
1986 case Decoder::VOpIViOpcode::kVorvi:
1987 return OpVectorvx<intrinsics::Vorvx<SignedType>, SignedType, vlmul, vta, vma>(
1988 args.dst, args.src, SignedType{args.imm});
1989 case Decoder::VOpIViOpcode::kVxorvi:
1990 return OpVectorvx<intrinsics::Vxorvx<SignedType>, SignedType, vlmul, vta, vma>(
1991 args.dst, args.src, SignedType{args.imm});
1992 case Decoder::VOpIViOpcode::kVrgathervi:
1993 return OpVectorGather<ElementType, vlmul, vta, vma>(
1994 args.dst, args.src, [&args](size_t /*index*/) { return ElementType{args.uimm}; });
1995 case Decoder::VOpIViOpcode::kVmseqvi:
1996 return OpVectorToMaskvx<intrinsics::Vseqvx<SignedType>, SignedType, vlmul, vma>(
1997 args.dst, args.src, SignedType{args.imm});
1998 case Decoder::VOpIViOpcode::kVmsnevi:
1999 return OpVectorToMaskvx<intrinsics::Vsnevx<SignedType>, SignedType, vlmul, vma>(
2000 args.dst, args.src, SignedType{args.imm});
2001 case Decoder::VOpIViOpcode::kVmsleuvi:
2002         // Note: Vmsleu.vi actually has a signed immediate, which means that we first need to
2003         // sign-expand it to the width of the element and then bit-cast it to unsigned.
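        // For example, with SEW=32 the 5-bit immediate -1 sign-expands to 0xffff'ffff.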
2004 return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2005 args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2006 case Decoder::VOpIViOpcode::kVmslevi:
2007 return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2008 args.dst, args.src, SignedType{args.imm});
2009 case Decoder::VOpIViOpcode::kVmsgtuvi:
2010         // Note: Vmsgtu.vi actually has a signed immediate, which means that we first need to
2011         // sign-expand it to the width of the element and then bit-cast it to unsigned.
2012 return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2013 args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2014 case Decoder::VOpIViOpcode::kVmsgtvi:
2015 return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2016 args.dst, args.src, SignedType{args.imm});
2017 case Decoder::VOpIViOpcode::kVsadduvi:
2018         // Note: Vsaddu.vi actually has a signed immediate, which means that we first need to
2019         // sign-expand it to the width of the element and then bit-cast it to unsigned.
2020 return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2021 SaturatingUnsignedType,
2022 vlmul,
2023 vta,
2024 vma>(
2025 args.dst, args.src, BitCastToUnsigned(SaturatingSignedType{args.imm}));
2026 case Decoder::VOpIViOpcode::kVsaddvi:
2027 return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2028 SaturatingSignedType,
2029 vlmul,
2030 vta,
2031 vma>(args.dst, args.src, SaturatingSignedType{args.imm});
2032 case Decoder::VOpIViOpcode::kVsllvi:
2033 return OpVectorvx<intrinsics::Vslvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2034 args.dst, args.src, UnsignedType{args.uimm});
2035 case Decoder::VOpIViOpcode::kVsrlvi:
2036 return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2037 args.dst, args.src, UnsignedType{args.uimm});
2038 case Decoder::VOpIViOpcode::kVsravi:
2039         // We need to pass the shift value here as a signed type, but the uimm value is always
2040         // positive and always fits into any integer type.
2041 return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2042 args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2043 case Decoder::VOpIViOpcode::kVmergevi:
2044 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2045 if (args.src != 0) {
2046 return Undefined();
2047 }
2048 return OpVectorx<intrinsics::Vcopyx<SignedType>, SignedType, vlmul, vta, vma>(
2049 args.dst, SignedType{args.imm});
2050 } else {
2051 return OpVectorx<intrinsics::Vcopyx<SignedType>,
2052 SignedType,
2053 vlmul,
2054 vta,
2055 // Always use "undisturbed" value from source register.
2056 InactiveProcessing::kUndisturbed>(
2057 args.dst, SignedType{args.imm}, /*dst_mask=*/args.src);
2058 }
2059 case Decoder::VOpIViOpcode::kVmvXrv:
2060 // kVmv<nr>rv instruction
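          // The immediate encodes nr - 1 and only nr in {1, 2, 4, 8} is valid, hence the accepted
          // values 0, 1, 3 and 7; anything else is reserved and treated as Undefined.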
2061 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2062 switch (args.imm) {
2063 case 0:
2064 return OpVectorVmvXrv<ElementType, 1>(args.dst, args.src);
2065 case 1:
2066 return OpVectorVmvXrv<ElementType, 2>(args.dst, args.src);
2067 case 3:
2068 return OpVectorVmvXrv<ElementType, 4>(args.dst, args.src);
2069 case 7:
2070 return OpVectorVmvXrv<ElementType, 8>(args.dst, args.src);
2071 default:
2072 return Undefined();
2073 }
2074 } else {
2075 return Undefined();
2076 }
2077 case Decoder::VOpIViOpcode::kVnsrawi:
2078         // We need to pass the shift value here as a signed type, but the uimm value is always
2079         // positive and always fits into any integer type.
2080 return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2081 args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2082 case Decoder::VOpIViOpcode::kVnsrlwi:
2083 return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2084 args.dst, args.src, UnsignedType{args.uimm});
2085 case Decoder::VOpIViOpcode::kVslideupvi:
2086 return OpVectorslideup<UnsignedType, vlmul, vta, vma>(
2087 args.dst, args.src, UnsignedType{args.uimm});
2088 case Decoder::VOpIViOpcode::kVslidedownvi:
2089 return OpVectorslidedown<UnsignedType, vlmul, vta, vma>(
2090 args.dst, args.src, UnsignedType{args.uimm});
2091 case Decoder::VOpIViOpcode::kVnclipuwi:
2092 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2093 SaturatingUnsignedType,
2094 vlmul,
2095 vta,
2096 vma,
2097 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2098 case Decoder::VOpIViOpcode::kVnclipwi:
2099 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2100 SaturatingSignedType,
2101 vlmul,
2102 vta,
2103 vma,
2104 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2105 case Decoder::VOpIViOpcode::kVssrlvi:
2106 return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2107 args.dst, args.src, UnsignedType{args.uimm});
2108 case Decoder::VOpIViOpcode::kVssravi:
2109 return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2110 args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2111 default:
2112 Undefined();
2113 }
2114 }
2115
2116 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2117   void OpVector(const Decoder::VOpIVvArgs& args) {
2118 using SignedType = berberis::SignedType<ElementType>;
2119 using UnsignedType = berberis::UnsignedType<ElementType>;
2120 using SaturatingSignedType = SaturatingType<SignedType>;
2121 using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2122 // Keep cases sorted in opcode order to match RISC-V V manual.
2123 switch (args.opcode) {
2124 case Decoder::VOpIVvOpcode::kVaddvv:
2125 return OpVectorvv<intrinsics::Vaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2126 args.dst, args.src1, args.src2);
2127 case Decoder::VOpIVvOpcode::kVsubvv:
2128 return OpVectorvv<intrinsics::Vsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2129 args.dst, args.src1, args.src2);
2130 case Decoder::VOpIVvOpcode::kVandvv:
2131 return OpVectorvv<intrinsics::Vandvv<ElementType>, ElementType, vlmul, vta, vma>(
2132 args.dst, args.src1, args.src2);
2133 case Decoder::VOpIVvOpcode::kVorvv:
2134 return OpVectorvv<intrinsics::Vorvv<ElementType>, ElementType, vlmul, vta, vma>(
2135 args.dst, args.src1, args.src2);
2136 case Decoder::VOpIVvOpcode::kVxorvv:
2137 return OpVectorvv<intrinsics::Vxorvv<ElementType>, ElementType, vlmul, vta, vma>(
2138 args.dst, args.src1, args.src2);
2139 case Decoder::VOpIVvOpcode::kVrgathervv: {
2140 constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
2141 if (!IsAligned<kRegistersInvolved>(args.src2)) {
2142 return Undefined();
2143 }
2144 constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
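        // Snapshot the whole index register group into a local buffer so the gather callback
        // below can look elements up by element index.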
2145 alignas(alignof(SIMD128Register)) ElementType indexes[vlmax];
2146 memcpy(indexes, state_->cpu.v + args.src2, sizeof(indexes));
2147 return OpVectorGather<ElementType, vlmul, vta, vma>(
2148 args.dst, args.src1, [&indexes](size_t index) { return indexes[index]; });
2149 }
2150 case Decoder::VOpIVvOpcode::kVmseqvv:
2151 return OpVectorToMaskvv<intrinsics::Vseqvv<ElementType>, ElementType, vlmul, vma>(
2152 args.dst, args.src1, args.src2);
2153 case Decoder::VOpIVvOpcode::kVmsnevv:
2154 return OpVectorToMaskvv<intrinsics::Vsnevv<ElementType>, ElementType, vlmul, vma>(
2155 args.dst, args.src1, args.src2);
2156 case Decoder::VOpIVvOpcode::kVmsltuvv:
2157 return OpVectorToMaskvv<intrinsics::Vsltvv<UnsignedType>, ElementType, vlmul, vma>(
2158 args.dst, args.src1, args.src2);
2159 case Decoder::VOpIVvOpcode::kVmsltvv:
2160 return OpVectorToMaskvv<intrinsics::Vsltvv<SignedType>, ElementType, vlmul, vma>(
2161 args.dst, args.src1, args.src2);
2162 case Decoder::VOpIVvOpcode::kVmsleuvv:
2163 return OpVectorToMaskvv<intrinsics::Vslevv<UnsignedType>, ElementType, vlmul, vma>(
2164 args.dst, args.src1, args.src2);
2165 case Decoder::VOpIVvOpcode::kVmslevv:
2166 return OpVectorToMaskvv<intrinsics::Vslevv<SignedType>, ElementType, vlmul, vma>(
2167 args.dst, args.src1, args.src2);
2168 case Decoder::VOpIVvOpcode::kVsadduvv:
2169 return OpVectorvv<intrinsics::Vaddvv<SaturatingUnsignedType>,
2170 SaturatingUnsignedType,
2171 vlmul,
2172 vta,
2173 vma>(args.dst, args.src1, args.src2);
2174 case Decoder::VOpIVvOpcode::kVsaddvv:
2175 return OpVectorvv<intrinsics::Vaddvv<SaturatingSignedType>,
2176 SaturatingSignedType,
2177 vlmul,
2178 vta,
2179 vma>(args.dst, args.src1, args.src2);
2180 case Decoder::VOpIVvOpcode::kVssubuvv:
2181 return OpVectorvv<intrinsics::Vsubvv<SaturatingUnsignedType>,
2182 SaturatingUnsignedType,
2183 vlmul,
2184 vta,
2185 vma>(args.dst, args.src1, args.src2);
2186 case Decoder::VOpIVvOpcode::kVssubvv:
2187 return OpVectorvv<intrinsics::Vsubvv<SaturatingSignedType>,
2188 SaturatingSignedType,
2189 vlmul,
2190 vta,
2191 vma>(args.dst, args.src1, args.src2);
2192 case Decoder::VOpIVvOpcode::kVsllvv:
2193 return OpVectorvv<intrinsics::Vslvv<ElementType>, ElementType, vlmul, vta, vma>(
2194 args.dst, args.src1, args.src2);
2195 case Decoder::VOpIVvOpcode::kVsrlvv:
2196 return OpVectorvv<intrinsics::Vsrvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2197 args.dst, args.src1, args.src2);
2198 case Decoder::VOpIVvOpcode::kVsravv:
2199 return OpVectorvv<intrinsics::Vsrvv<SignedType>, ElementType, vlmul, vta, vma>(
2200 args.dst, args.src1, args.src2);
2201 case Decoder::VOpIVvOpcode::kVminuvv:
2202 return OpVectorvv<intrinsics::Vminvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2203 args.dst, args.src1, args.src2);
2204 case Decoder::VOpIVvOpcode::kVminvv:
2205 return OpVectorvv<intrinsics::Vminvv<SignedType>, ElementType, vlmul, vta, vma>(
2206 args.dst, args.src1, args.src2);
2207 case Decoder::VOpIVvOpcode::kVmaxuvv:
2208 return OpVectorvv<intrinsics::Vmaxvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2209 args.dst, args.src1, args.src2);
2210 case Decoder::VOpIVvOpcode::kVmaxvv:
2211 return OpVectorvv<intrinsics::Vmaxvv<SignedType>, ElementType, vlmul, vta, vma>(
2212 args.dst, args.src1, args.src2);
2213 case Decoder::VOpIVvOpcode::kVmergevv:
2214 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2215 if (args.src1 != 0) {
2216 return Undefined();
2217 }
2218 return OpVectorv<intrinsics::Vcopyv<ElementType>, ElementType, vlmul, vta, vma>(
2219 args.dst, args.src2);
2220 } else {
2221 return OpVectorv<intrinsics::Vcopyv<ElementType>,
2222 ElementType,
2223 vlmul,
2224 vta,
2225 // Always use "undisturbed" value from source register.
2226 InactiveProcessing::kUndisturbed>(
2227 args.dst, args.src2, /*dst_mask=*/args.src1);
2228 }
2229 case Decoder::VOpIVvOpcode::kVnsrawv:
2230 return OpVectorNarrowwv<intrinsics::Vnsrwv<SignedType>, SignedType, vlmul, vta, vma>(
2231 args.dst, args.src1, args.src2);
2232 case Decoder::VOpIVvOpcode::kVnsrlwv:
2233 return OpVectorNarrowwv<intrinsics::Vnsrwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2234 args.dst, args.src1, args.src2);
2235 case Decoder::VOpIVvOpcode::kVsmulvv:
2236 return OpVectorvv<intrinsics::Vsmulvv<SaturatingSignedType>,
2237 ElementType,
2238 vlmul,
2239 vta,
2240 vma,
2241 kVxrm>(args.dst, args.src1, args.src2);
2242 case Decoder::VOpIVvOpcode::kVssrlvv:
2243 return OpVectorvv<intrinsics::Vssrvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2244 args.dst, args.src1, args.src2);
2245 case Decoder::VOpIVvOpcode::kVssravv:
2246 return OpVectorvv<intrinsics::Vssrvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2247 args.dst, args.src1, args.src2);
2248 case Decoder::VOpIVvOpcode::kVnclipuwv:
2249 return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingUnsignedType>,
2250 SaturatingUnsignedType,
2251 vlmul,
2252 vta,
2253 vma,
2254 kVxrm>(args.dst, args.src1, args.src2);
2255 case Decoder::VOpIVvOpcode::kVnclipwv:
2256 return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingSignedType>,
2257 SaturatingSignedType,
2258 vlmul,
2259 vta,
2260 vma,
2261 kVxrm>(args.dst, args.src1, args.src2);
2262 default:
2263 Undefined();
2264 }
2265 }
2266
2267 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2268   void OpVector(const Decoder::VOpIVxArgs& args, Register arg2) {
2269 using SignedType = berberis::SignedType<ElementType>;
2270 using UnsignedType = berberis::UnsignedType<ElementType>;
2271 using SaturatingSignedType = SaturatingType<SignedType>;
2272 using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2273 // Keep cases sorted in opcode order to match RISC-V V manual.
2274 switch (args.opcode) {
2275 case Decoder::VOpIVxOpcode::kVaddvx:
2276 return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2277 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2278 case Decoder::VOpIVxOpcode::kVsubvx:
2279 return OpVectorvx<intrinsics::Vsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2280 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2281 case Decoder::VOpIVxOpcode::kVrsubvx:
2282 return OpVectorvx<intrinsics::Vrsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2283 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2284 case Decoder::VOpIVxOpcode::kVandvx:
2285 return OpVectorvx<intrinsics::Vandvx<ElementType>, ElementType, vlmul, vta, vma>(
2286 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2287 case Decoder::VOpIVxOpcode::kVorvx:
2288 return OpVectorvx<intrinsics::Vorvx<ElementType>, ElementType, vlmul, vta, vma>(
2289 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2290 case Decoder::VOpIVxOpcode::kVxorvx:
2291 return OpVectorvx<intrinsics::Vxorvx<ElementType>, ElementType, vlmul, vta, vma>(
2292 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2293 case Decoder::VOpIVxOpcode::kVrgathervx:
2294 return OpVectorGather<ElementType, vlmul, vta, vma>(
2295 args.dst, args.src1, [&arg2](size_t /*index*/) {
2296 return MaybeTruncateTo<ElementType>(arg2);
2297 });
2298 case Decoder::VOpIVxOpcode::kVmseqvx:
2299 return OpVectorToMaskvx<intrinsics::Vseqvx<ElementType>, ElementType, vlmul, vma>(
2300 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2301 case Decoder::VOpIVxOpcode::kVmsnevx:
2302 return OpVectorToMaskvx<intrinsics::Vsnevx<ElementType>, ElementType, vlmul, vma>(
2303 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2304 case Decoder::VOpIVxOpcode::kVmsltuvx:
2305 return OpVectorToMaskvx<intrinsics::Vsltvx<UnsignedType>, UnsignedType, vlmul, vma>(
2306 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2307 case Decoder::VOpIVxOpcode::kVmsltvx:
2308 return OpVectorToMaskvx<intrinsics::Vsltvx<SignedType>, SignedType, vlmul, vma>(
2309 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2310 case Decoder::VOpIVxOpcode::kVmsleuvx:
2311 return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2312 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2313 case Decoder::VOpIVxOpcode::kVmslevx:
2314 return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2315 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2316 case Decoder::VOpIVxOpcode::kVmsgtuvx:
2317 return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2318 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2319 case Decoder::VOpIVxOpcode::kVmsgtvx:
2320 return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2321 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2322 case Decoder::VOpIVxOpcode::kVsadduvx:
2323 return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2324 SaturatingUnsignedType,
2325 vlmul,
2326 vta,
2327 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2328 case Decoder::VOpIVxOpcode::kVsaddvx:
2329 return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2330 SaturatingSignedType,
2331 vlmul,
2332 vta,
2333 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2334 case Decoder::VOpIVxOpcode::kVssubuvx:
2335 return OpVectorvx<intrinsics::Vsubvx<SaturatingUnsignedType>,
2336 SaturatingUnsignedType,
2337 vlmul,
2338 vta,
2339 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2340 case Decoder::VOpIVxOpcode::kVssubvx:
2341 return OpVectorvx<intrinsics::Vsubvx<SaturatingSignedType>,
2342 SaturatingSignedType,
2343 vlmul,
2344 vta,
2345 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2346 case Decoder::VOpIVxOpcode::kVsllvx:
2347 return OpVectorvx<intrinsics::Vslvx<ElementType>, ElementType, vlmul, vta, vma>(
2348 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2349 case Decoder::VOpIVxOpcode::kVsrlvx:
2350 return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2351 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2352 case Decoder::VOpIVxOpcode::kVsravx:
2353 return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2354 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2355 case Decoder::VOpIVxOpcode::kVminuvx:
2356 return OpVectorvx<intrinsics::Vminvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2357 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2358 case Decoder::VOpIVxOpcode::kVminvx:
2359 return OpVectorvx<intrinsics::Vminvx<SignedType>, SignedType, vlmul, vta, vma>(
2360 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2361 case Decoder::VOpIVxOpcode::kVmaxuvx:
2362 return OpVectorvx<intrinsics::Vmaxvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2363 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2364 case Decoder::VOpIVxOpcode::kVmaxvx:
2365 return OpVectorvx<intrinsics::Vmaxvx<SignedType>, SignedType, vlmul, vta, vma>(
2366 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2367 case Decoder::VOpIVxOpcode::kVmergevx:
2368 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2369 if (args.src1 != 0) {
2370 return Undefined();
2371 }
2372 return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(
2373 args.dst, MaybeTruncateTo<ElementType>(arg2));
2374 } else {
2375 return OpVectorx<intrinsics::Vcopyx<ElementType>,
2376 ElementType,
2377 vlmul,
2378 vta,
2379 // Always use "undisturbed" value from source register.
2380 InactiveProcessing::kUndisturbed>(
2381 args.dst, MaybeTruncateTo<ElementType>(arg2), /*dst_mask=*/args.src1);
2382 }
2383 case Decoder::VOpIVxOpcode::kVnsrawx:
2384 return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2385 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2386 case Decoder::VOpIVxOpcode::kVnsrlwx:
2387 return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2388 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2389 case Decoder::VOpIVxOpcode::kVslideupvx:
2390 return OpVectorslideup<ElementType, vlmul, vta, vma>(
2391 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2392 case Decoder::VOpIVxOpcode::kVslidedownvx:
2393 return OpVectorslidedown<ElementType, vlmul, vta, vma>(
2394 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2395 case Decoder::VOpIVxOpcode::kVsmulvx:
2396 return OpVectorvx<intrinsics::Vsmulvx<SaturatingSignedType>,
2397 SaturatingSignedType,
2398 vlmul,
2399 vta,
2400 vma,
2401 kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2402 case Decoder::VOpIVxOpcode::kVssrlvx:
2403 return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2404 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2405 case Decoder::VOpIVxOpcode::kVssravx:
2406 return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2407 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2408 case Decoder::VOpIVxOpcode::kVnclipuwx:
2409 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2410 SaturatingUnsignedType,
2411 vlmul,
2412 vta,
2413 vma,
2414 kVxrm>(args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2415 case Decoder::VOpIVxOpcode::kVnclipwx:
2416 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2417 SaturatingSignedType,
2418 vlmul,
2419 vta,
2420 vma,
2421 kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2422 default:
2423 Undefined();
2424 }
2425 }
2426
2427 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2428   void OpVector(const Decoder::VOpMVvArgs& args) {
2429 using SignedType = berberis::SignedType<ElementType>;
2430 using UnsignedType = berberis::UnsignedType<ElementType>;
2431 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
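      // Mask-register logical instructions (vmand.mm and friends) are always unmasked, so they are
      // only handled when vma is NoInactiveProcessing; a masked encoding falls through to the next
      // switch and ends up in Undefined.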
2432 // Keep cases sorted in opcode order to match RISC-V V manual.
2433 switch (args.opcode) {
2434 case Decoder::VOpMVvOpcode::kVmandnmm:
2435 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & ~rhs; }>(
2436 args.dst, args.src1, args.src2);
2437 case Decoder::VOpMVvOpcode::kVmandmm:
2438 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & rhs; }>(
2439 args.dst, args.src1, args.src2);
2440 case Decoder::VOpMVvOpcode::kVmormm:
2441 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | rhs; }>(
2442 args.dst, args.src1, args.src2);
2443 case Decoder::VOpMVvOpcode::kVmxormm:
2444 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs ^ rhs; }>(
2445 args.dst, args.src1, args.src2);
2446 case Decoder::VOpMVvOpcode::kVmornmm:
2447 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | ~rhs; }>(
2448 args.dst, args.src1, args.src2);
2449 case Decoder::VOpMVvOpcode::kVmnandmm:
2450 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs & rhs); }>(
2451 args.dst, args.src1, args.src2);
2452 case Decoder::VOpMVvOpcode::kVmnormm:
2453 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs | rhs); }>(
2454 args.dst, args.src1, args.src2);
2455 case Decoder::VOpMVvOpcode::kVmxnormm:
2456 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs ^ rhs); }>(
2457 args.dst, args.src1, args.src2);
2458 default:; // Do nothing: handled in next switch.
2459 }
2460 }
2461 // Keep cases sorted in opcode order to match RISC-V V manual.
2462 switch (args.opcode) {
2463 case Decoder::VOpMVvOpcode::kVredsumvs:
2464 return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
2465 args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2466 case Decoder::VOpMVvOpcode::kVredandvs:
2467 return OpVectorvs<intrinsics::Vredandvs<ElementType>, ElementType, vlmul, vta, vma>(
2468 args.dst, Vec<~ElementType{}>{args.src1}, args.src2);
2469 case Decoder::VOpMVvOpcode::kVredorvs:
2470 return OpVectorvs<intrinsics::Vredorvs<ElementType>, ElementType, vlmul, vta, vma>(
2471 args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2472 case Decoder::VOpMVvOpcode::kVredxorvs:
2473 return OpVectorvs<intrinsics::Vredxorvs<ElementType>, ElementType, vlmul, vta, vma>(
2474 args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2475 case Decoder::VOpMVvOpcode::kVredminuvs:
2476 return OpVectorvs<intrinsics::Vredminvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2477 args.dst,
2478 Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>{
2479 args.src1},
2480 args.src2);
2481 case Decoder::VOpMVvOpcode::kVredminvs:
2482 return OpVectorvs<intrinsics::Vredminvs<SignedType>, SignedType, vlmul, vta, vma>(
2483 args.dst,
2484 Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src1},
2485 args.src2);
2486 case Decoder::VOpMVvOpcode::kVredmaxuvs:
2487 return OpVectorvs<intrinsics::Vredmaxvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2488 args.dst, Vec<UnsignedType{}>{args.src1}, args.src2);
2489 case Decoder::VOpMVvOpcode::kVredmaxvs:
2490 return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>(
2491 args.dst,
2492 Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src1},
2493 args.src2);
2494 case Decoder::VOpMVvOpcode::kVaadduvv:
2495 return OpVectorvv<intrinsics::Vaaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2496 args.dst, args.src1, args.src2);
2497 case Decoder::VOpMVvOpcode::kVaaddvv:
2498 return OpVectorvv<intrinsics::Vaaddvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2499 args.dst, args.src1, args.src2);
2500 case Decoder::VOpMVvOpcode::kVasubuvv:
2501 return OpVectorvv<intrinsics::Vasubvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2502 args.dst, args.src1, args.src2);
2503 case Decoder::VOpMVvOpcode::kVasubvv:
2504 return OpVectorvv<intrinsics::Vasubvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2505 args.dst, args.src1, args.src2);
2506 case Decoder::VOpMVvOpcode::kVWXUnary0:
2507 switch (args.vwxunary0_opcode) {
2508 case Decoder::VWXUnary0Opcode::kVmvxs:
2509 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2510 return Undefined();
2511 }
2512 return OpVectorVmvxs<SignedType>(args.dst, args.src1);
2513 case Decoder::VWXUnary0Opcode::kVcpopm:
2514 return OpVectorVWXUnary0<intrinsics::Vcpopm<>, vma>(args.dst, args.src1);
2515 case Decoder::VWXUnary0Opcode::kVfirstm:
2516 return OpVectorVWXUnary0<intrinsics::Vfirstm<>, vma>(args.dst, args.src1);
2517 default:
2518 return Undefined();
2519 }
2520 case Decoder::VOpMVvOpcode::kVFUnary0:
2521 switch (args.vxunary0_opcode) {
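          // vzext.vf2/vsext.vf2 require SEW >= 16, vf4 requires SEW >= 32, and vf8 requires
          // SEW >= 64 so that the fractional-width source element exists; narrower element types
          // fall through to the Undefined below.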
2522 case Decoder::VXUnary0Opcode::kVzextvf2m:
2523 if constexpr (sizeof(UnsignedType) >= 2) {
2524 return OpVectorVXUnary0<intrinsics::Vextf2<UnsignedType>,
2525 UnsignedType,
2526 2,
2527 vlmul,
2528 vta,
2529 vma>(args.dst, args.src1);
2530 }
2531 break;
2532 case Decoder::VXUnary0Opcode::kVsextvf2m:
2533 if constexpr (sizeof(SignedType) >= 2) {
2534 return OpVectorVXUnary0<intrinsics::Vextf2<SignedType>,
2535 SignedType,
2536 2,
2537 vlmul,
2538 vta,
2539 vma>(args.dst, args.src1);
2540 }
2541 break;
2542 case Decoder::VXUnary0Opcode::kVzextvf4m:
2543 if constexpr (sizeof(UnsignedType) >= 4) {
2544 return OpVectorVXUnary0<intrinsics::Vextf4<UnsignedType>,
2545 UnsignedType,
2546 4,
2547 vlmul,
2548 vta,
2549 vma>(args.dst, args.src1);
2550 }
2551 break;
2552 case Decoder::VXUnary0Opcode::kVsextvf4m:
2553 if constexpr (sizeof(SignedType) >= 4) {
2554 return OpVectorVXUnary0<intrinsics::Vextf4<SignedType>,
2555 SignedType,
2556 4,
2557 vlmul,
2558 vta,
2559 vma>(args.dst, args.src1);
2560 }
2561 break;
2562 case Decoder::VXUnary0Opcode::kVzextvf8m:
2563 if constexpr (sizeof(UnsignedType) >= 8) {
2564 return OpVectorVXUnary0<intrinsics::Vextf8<UnsignedType>,
2565 UnsignedType,
2566 8,
2567 vlmul,
2568 vta,
2569 vma>(args.dst, args.src1);
2570 }
2571 break;
2572 case Decoder::VXUnary0Opcode::kVsextvf8m:
2573 if constexpr (sizeof(SignedType) >= 8) {
2574 return OpVectorVXUnary0<intrinsics::Vextf8<SignedType>,
2575 SignedType,
2576 8,
2577 vlmul,
2578 vta,
2579 vma>(args.dst, args.src1);
2580 }
2581 break;
2582 default:
2583 return Undefined();
2584 }
2585 return Undefined();
2586 case Decoder::VOpMVvOpcode::kVMUnary0:
2587 switch (args.vmunary0_opcode) {
2588 case Decoder::VMUnary0Opcode::kVmsbfm:
2589 return OpVectorVMUnary0<intrinsics::Vmsbfm<>, vma>(args.dst, args.src1);
2590 case Decoder::VMUnary0Opcode::kVmsofm:
2591 return OpVectorVMUnary0<intrinsics::Vmsofm<>, vma>(args.dst, args.src1);
2592 case Decoder::VMUnary0Opcode::kVmsifm:
2593 return OpVectorVMUnary0<intrinsics::Vmsifm<>, vma>(args.dst, args.src1);
2594 case Decoder::VMUnary0Opcode::kViotam:
2595 return OpVectorViotam<ElementType, vlmul, vta, vma>(args.dst, args.src1);
2596 case Decoder::VMUnary0Opcode::kVidv:
2597 if (args.src1) {
2598 return Undefined();
2599 }
2600 return OpVectorVidv<ElementType, vlmul, vta, vma>(args.dst);
2601 default:
2602 return Undefined();
2603 }
2604 case Decoder::VOpMVvOpcode::kVdivuvv:
2605 return OpVectorvv<intrinsics::Vdivvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2606 args.dst, args.src1, args.src2);
2607 case Decoder::VOpMVvOpcode::kVdivvv:
2608 return OpVectorvv<intrinsics::Vdivvv<SignedType>, SignedType, vlmul, vta, vma>(
2609 args.dst, args.src1, args.src2);
2610 case Decoder::VOpMVvOpcode::kVremuvv:
2611 return OpVectorvv<intrinsics::Vremvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2612 args.dst, args.src1, args.src2);
2613 case Decoder::VOpMVvOpcode::kVremvv:
2614 return OpVectorvv<intrinsics::Vremvv<SignedType>, SignedType, vlmul, vta, vma>(
2615 args.dst, args.src1, args.src2);
2616 case Decoder::VOpMVvOpcode::kVmulhuvv:
2617 return OpVectorvv<intrinsics::Vmulhvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2618 args.dst, args.src1, args.src2);
2619 case Decoder::VOpMVvOpcode::kVmulvv:
2620 return OpVectorvv<intrinsics::Vmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2621 args.dst, args.src1, args.src2);
2622 case Decoder::VOpMVvOpcode::kVmulhsuvv:
2623 return OpVectorvv<intrinsics::Vmulhsuvv<SignedType>, SignedType, vlmul, vta, vma>(
2624 args.dst, args.src1, args.src2);
2625 case Decoder::VOpMVvOpcode::kVmulhvv:
2626 return OpVectorvv<intrinsics::Vmulhvv<SignedType>, SignedType, vlmul, vta, vma>(
2627 args.dst, args.src1, args.src2);
2628 case Decoder::VOpMVvOpcode::kVmaddvv:
2629 return OpVectorvvv<intrinsics::Vmaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2630 args.dst, args.src1, args.src2);
2631 case Decoder::VOpMVvOpcode::kVnmsubvv:
2632 return OpVectorvvv<intrinsics::Vnmsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2633 args.dst, args.src1, args.src2);
2634 case Decoder::VOpMVvOpcode::kVmaccvv:
2635 return OpVectorvvv<intrinsics::Vmaccvv<ElementType>, ElementType, vlmul, vta, vma>(
2636 args.dst, args.src1, args.src2);
2637 case Decoder::VOpMVvOpcode::kVnmsacvv:
2638 return OpVectorvvv<intrinsics::Vnmsacvv<ElementType>, ElementType, vlmul, vta, vma>(
2639 args.dst, args.src1, args.src2);
2640 case Decoder::VOpMVvOpcode::kVwadduvv:
2641 return OpVectorWidenvv<intrinsics::Vwaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2642 args.dst, args.src1, args.src2);
2643 case Decoder::VOpMVvOpcode::kVwaddvv:
2644 return OpVectorWidenvv<intrinsics::Vwaddvv<SignedType>, SignedType, vlmul, vta, vma>(
2645 args.dst, args.src1, args.src2);
2646 case Decoder::VOpMVvOpcode::kVwsubuvv:
2647 return OpVectorWidenvv<intrinsics::Vwsubvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2648 args.dst, args.src1, args.src2);
2649 case Decoder::VOpMVvOpcode::kVwsubvv:
2650 return OpVectorWidenvv<intrinsics::Vwsubvv<SignedType>, SignedType, vlmul, vta, vma>(
2651 args.dst, args.src1, args.src2);
2652 case Decoder::VOpMVvOpcode::kVwadduwv:
2653 return OpVectorWidenwv<intrinsics::Vwaddwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2654 args.dst, args.src1, args.src2);
2655 case Decoder::VOpMVvOpcode::kVwaddwv:
2656 return OpVectorWidenwv<intrinsics::Vwaddwv<SignedType>, SignedType, vlmul, vta, vma>(
2657 args.dst, args.src1, args.src2);
2658 case Decoder::VOpMVvOpcode::kVwsubuwv:
2659 return OpVectorWidenwv<intrinsics::Vwsubwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2660 args.dst, args.src1, args.src2);
2661 case Decoder::VOpMVvOpcode::kVwsubwv:
2662 return OpVectorWidenwv<intrinsics::Vwsubwv<SignedType>, SignedType, vlmul, vta, vma>(
2663 args.dst, args.src1, args.src2);
2664 case Decoder::VOpMVvOpcode::kVwmuluvv:
2665 return OpVectorWidenvv<intrinsics::Vwmulvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2666 args.dst, args.src1, args.src2);
2667 case Decoder::VOpMVvOpcode::kVwmulsuvv:
2668 return OpVectorWidenvv<intrinsics::Vwmulsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2669 args.dst, args.src1, args.src2);
2670 case Decoder::VOpMVvOpcode::kVwmulvv:
2671 return OpVectorWidenvv<intrinsics::Vwmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2672 args.dst, args.src1, args.src2);
2673 case Decoder::VOpMVvOpcode::kVwmaccuvv:
2674 return OpVectorWidenvvw<intrinsics::Vwmaccvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2675 args.dst, args.src1, args.src2);
2676 case Decoder::VOpMVvOpcode::kVwmaccvv:
2677 return OpVectorWidenvvw<intrinsics::Vwmaccvv<SignedType>, SignedType, vlmul, vta, vma>(
2678 args.dst, args.src1, args.src2);
2679 case Decoder::VOpMVvOpcode::kVwmaccsuvv:
2680 return OpVectorWidenvvw<intrinsics::Vwmaccsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2681 args.dst, args.src1, args.src2);
2682 default:
2683 Undefined();
2684 }
2685 }
2686
2687 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2688   void OpVector(const Decoder::VOpMVxArgs& args, Register arg2) {
2689 using SignedType = berberis::SignedType<ElementType>;
2690 using UnsignedType = berberis::UnsignedType<ElementType>;
2691 // Keep cases sorted in opcode order to match RISC-V V manual.
2692 switch (args.opcode) {
2693 case Decoder::VOpMVxOpcode::kVaadduvx:
2694 return OpVectorvx<intrinsics::Vaaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2695 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2696 case Decoder::VOpMVxOpcode::kVaaddvx:
2697 return OpVectorvx<intrinsics::Vaaddvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2698 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2699 case Decoder::VOpMVxOpcode::kVasubuvx:
2700 return OpVectorvx<intrinsics::Vasubvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2701 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2702 case Decoder::VOpMVxOpcode::kVasubvx:
2703 return OpVectorvx<intrinsics::Vasubvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2704 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2705 case Decoder::VOpMVxOpcode::kVslide1upvx:
2706 return OpVectorslide1up<SignedType, vlmul, vta, vma>(
2707 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2708 case Decoder::VOpMVxOpcode::kVslide1downvx:
2709 return OpVectorslide1down<SignedType, vlmul, vta, vma>(
2710 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2711 case Decoder::VOpMVxOpcode::kVRXUnary0:
2712 switch (args.vrxunary0_opcode) {
2713 case Decoder::VRXUnary0Opcode::kVmvsx:
2714 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2715 return Undefined();
2716 }
2717 return OpVectorVmvsx<SignedType, vta>(args.dst, MaybeTruncateTo<SignedType>(arg2));
2718 default:
2719 return Undefined();
2720 }
2721 case Decoder::VOpMVxOpcode::kVmulhuvx:
2722 return OpVectorvx<intrinsics::Vmulhvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2723 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2724 case Decoder::VOpMVxOpcode::kVmulvx:
2725 return OpVectorvx<intrinsics::Vmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2726 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2727 case Decoder::VOpMVxOpcode::kVdivuvx:
2728 return OpVectorvx<intrinsics::Vdivvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2729 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2730 case Decoder::VOpMVxOpcode::kVdivvx:
2731 return OpVectorvx<intrinsics::Vdivvx<SignedType>, SignedType, vlmul, vta, vma>(
2732 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2733 case Decoder::VOpMVxOpcode::kVremuvx:
2734 return OpVectorvx<intrinsics::Vremvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2735 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2736 case Decoder::VOpMVxOpcode::kVremvx:
2737 return OpVectorvx<intrinsics::Vremvx<SignedType>, SignedType, vlmul, vta, vma>(
2738 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2739 case Decoder::VOpMVxOpcode::kVmulhsuvx:
2740 return OpVectorvx<intrinsics::Vmulhsuvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2741 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2742 case Decoder::VOpMVxOpcode::kVmulhvx:
2743 return OpVectorvx<intrinsics::Vmulhvx<SignedType>, SignedType, vlmul, vta, vma>(
2744 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2745 case Decoder::VOpMVxOpcode::kVmaddvx:
2746 return OpVectorvxv<intrinsics::Vmaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2747 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2748 case Decoder::VOpMVxOpcode::kVnmsubvx:
2749 return OpVectorvxv<intrinsics::Vnmsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2750 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2751 case Decoder::VOpMVxOpcode::kVmaccvx:
2752 return OpVectorvxv<intrinsics::Vmaccvx<ElementType>, ElementType, vlmul, vta, vma>(
2753 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2754 case Decoder::VOpMVxOpcode::kVnmsacvx:
2755 return OpVectorvxv<intrinsics::Vnmsacvx<ElementType>, ElementType, vlmul, vta, vma>(
2756 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2757 case Decoder::VOpMVxOpcode::kVwadduvx:
2758 return OpVectorWidenvx<intrinsics::Vwaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2759 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2760 case Decoder::VOpMVxOpcode::kVwaddvx:
2761 return OpVectorWidenvx<intrinsics::Vwaddvx<SignedType>, SignedType, vlmul, vta, vma>(
2762 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2763 case Decoder::VOpMVxOpcode::kVwsubuvx:
2764 return OpVectorWidenvx<intrinsics::Vwsubvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2765 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2766 case Decoder::VOpMVxOpcode::kVwsubvx:
2767 return OpVectorWidenvx<intrinsics::Vwsubvx<SignedType>, SignedType, vlmul, vta, vma>(
2768 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2769 case Decoder::VOpMVxOpcode::kVwadduwx:
2770 return OpVectorWidenwx<intrinsics::Vwaddwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2771 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2772 case Decoder::VOpMVxOpcode::kVwaddwx:
2773 return OpVectorWidenwx<intrinsics::Vwaddwx<SignedType>, SignedType, vlmul, vta, vma>(
2774 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2775 case Decoder::VOpMVxOpcode::kVwsubuwx:
2776 return OpVectorWidenwx<intrinsics::Vwsubwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2777 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2778 case Decoder::VOpMVxOpcode::kVwsubwx:
2779 return OpVectorWidenwx<intrinsics::Vwsubwx<SignedType>, SignedType, vlmul, vta, vma>(
2780 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2781 case Decoder::VOpMVxOpcode::kVwmuluvx:
2782 return OpVectorWidenvx<intrinsics::Vwmulvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2783 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2784 case Decoder::VOpMVxOpcode::kVwmulsuvx:
2785 return OpVectorWidenvx<intrinsics::Vwmulsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2786 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2787 case Decoder::VOpMVxOpcode::kVwmulvx:
2788 return OpVectorWidenvx<intrinsics::Vwmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2789 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2790 case Decoder::VOpMVxOpcode::kVwmaccuvx:
2791 return OpVectorWidenvxw<intrinsics::Vwmaccvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2792 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2793 case Decoder::VOpMVxOpcode::kVwmaccvx:
2794 return OpVectorWidenvxw<intrinsics::Vwmaccvx<SignedType>, SignedType, vlmul, vta, vma>(
2795 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2796 case Decoder::VOpMVxOpcode::kVwmaccusvx:
2797 return OpVectorWidenvxw<intrinsics::Vwmaccusvx<ElementType>, ElementType, vlmul, vta, vma>(
2798 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2799 case Decoder::VOpMVxOpcode::kVwmaccsuvx:
2800 return OpVectorWidenvxw<intrinsics::Vwmaccsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2801 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2802 default:
2803 Undefined();
2804 }
2805 }
2806
2807 template <typename DataElementType,
2808 VectorRegisterGroupMultiplier vlmul,
2809 typename IndexElementType,
2810 size_t kSegmentSize,
2811 size_t kIndexRegistersInvolved,
2812 TailProcessing vta,
2813 auto vma>
OpVector(const Decoder::VStoreIndexedArgs & args,Register src)2814 void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2815 return OpVector<DataElementType,
2816 kSegmentSize,
2817 NumberOfRegistersInvolved(vlmul),
2818 IndexElementType,
2819 kIndexRegistersInvolved,
2820 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(args, src);
2821 }
2822
2823 template <typename DataElementType,
2824 size_t kSegmentSize,
2825 size_t kNumRegistersInGroup,
2826 typename IndexElementType,
2827 size_t kIndexRegistersInvolved,
2828 bool kUseMasking>
OpVector(const Decoder::VStoreIndexedArgs & args,Register src)2829 void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2830 if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
2831 return Undefined();
2832 }
2833 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
2834 alignas(alignof(SIMD128Register))
2835 IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
2836 memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
2837 return OpVectorStore<DataElementType, kSegmentSize, kNumRegistersInGroup, kUseMasking>(
2838 args.data, src, [&indexes](size_t index) { return indexes[index]; });
2839 }
2840
2841 template <typename ElementType,
2842 size_t kSegmentSize,
2843 VectorRegisterGroupMultiplier vlmul,
2844 TailProcessing vta,
2845 auto vma>
OpVector(const Decoder::VStoreStrideArgs & args,Register src,Register stride)2846 void OpVector(const Decoder::VStoreStrideArgs& args, Register src, Register stride) {
2847 return OpVectorStore<ElementType,
2848 kSegmentSize,
2849 NumberOfRegistersInvolved(vlmul),
2850 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(
2851 args.data, src, [stride](size_t index) { return stride * index; });
2852 }
2853
2854 template <typename ElementType,
2855 size_t kSegmentSize,
2856 VectorRegisterGroupMultiplier vlmul,
2857 TailProcessing vta,
2858 auto vma>
OpVector(const Decoder::VStoreUnitStrideArgs & args,Register src)2859 void OpVector(const Decoder::VStoreUnitStrideArgs& args, Register src) {
2860 switch (args.opcode) {
2861 case Decoder::VSUmOpOpcode::kVseXX:
2862 return OpVectorStore<ElementType,
2863 kSegmentSize,
2864 NumberOfRegistersInvolved(vlmul),
2865 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
2866 Decoder::VSUmOpOpcode::kVseXX>(args.data, src, [](size_t index) {
2867 return kSegmentSize * sizeof(ElementType) * index;
2868 });
2869 case Decoder::VSUmOpOpcode::kVsm:
2870 if constexpr (kSegmentSize == 1 &&
2871 std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2872 return OpVectorStore<UInt8,
2873 1,
2874 1,
2875 /*kUseMasking=*/false,
2876 Decoder::VSUmOpOpcode::kVsm>(
2877 args.data, src, [](size_t index) { return index; });
2878 }
2879 return Undefined();
2880 default:
2881 return Undefined();
2882 }
2883 }
2884
2885  // See VLoadStrideArgs handling for an explanation of the semantics: VStoreStrideArgs is almost
2886  // symmetric, except that it ignores the vta and vma modes and never alters inactive elements in memory.
2887 template <typename ElementType,
2888 size_t kSegmentSize,
2889 size_t kNumRegistersInGroup,
2890 bool kUseMasking,
2891 typename Decoder::VSUmOpOpcode opcode = typename Decoder::VSUmOpOpcode{},
2892 typename GetElementOffsetLambdaType>
2893 void OpVectorStore(uint8_t data, Register src, GetElementOffsetLambdaType GetElementOffset) {
2894 using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
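    // With 8-bit elements a single 128-bit register holds 16 elements and therefore needs 16 mask
    // bits (UInt16); every wider element type fits into 8 mask bits or fewer (UInt8).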
2895 if (!IsAligned<kNumRegistersInGroup>(data)) {
2896 return Undefined();
2897 }
2898 if (data + kNumRegistersInGroup * kSegmentSize > 32) {
2899 return Undefined();
2900 }
2901 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
2902 size_t vstart = GetCsr<CsrName::kVstart>();
2903 size_t vl = GetCsr<CsrName::kVl>();
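    // For vsm.v the effective vl is a byte count: mask bits are stored packed, so below we round vl
    // up to a whole number of bytes (e.g. vl == 17 mask bits becomes 3 bytes to store).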
2904 if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) {
2905 vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
2906 }
2907    // In case of a memory access fault we may set vstart to a non-zero value; set it to zero here
2908    // to simplify the logic below.
2909 SetCsr<CsrName::kVstart>(0);
2910 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
2911 // vector register group, including that no tail elements are updated with agnostic values.
2912 if (vstart >= vl) [[unlikely]] {
2913      // Technically, since stores never touch tail elements this isn't needed, but it makes it
2914      // easier to reason about the rest of the function.
2915 return;
2916 }
2917 char* ptr = ToHostAddr<char>(src);
2918    // Note: within_group_id is the current register id within a register group. During one
2919    // iteration of this loop we store results for all registers with the current id in all
2920    // groups. E.g. for a segmented store with three fields over two-register groups we'd store
2921    // data from v0, v2, v4 during the first iteration (id within group = 0), and v1, v3, v5 during
2922    // the second iteration (id within group = 1). This ensures that memory is always accessed in an ordered fashion.
2923 auto mask = GetMaskForVectorOperationsIfNeeded<kUseMasking>();
2924 for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
2925 ++within_group_id) {
2926 // No need to continue if we no longer have elements to store.
2927 if (within_group_id * kElementsCount >= vl) {
2928 break;
2929 }
2930 auto register_mask =
2931 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
2932 // Store elements to memory, but only if there are any active ones.
2933 for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
2934 ++within_register_id) {
2935 size_t element_index = kElementsCount * within_group_id + within_register_id;
2936 // Stop if we reached the vl limit.
2937 if (vl <= element_index) {
2938 break;
2939 }
2940 // Don't touch masked-out elements.
2941 if constexpr (kUseMasking) {
2942 if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
2943 1 << within_register_id)}) == MaskType{0}) {
2944 continue;
2945 }
2946 }
2947 // Store segment to memory.
2948 for (size_t field = 0; field < kSegmentSize; ++field) {
2949 bool exception_raised = FaultyStore(
2950 ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
2951 sizeof(ElementType),
2952 SIMD128Register{state_->cpu.v[data + within_group_id + field * kNumRegistersInGroup]}
2953 .Get<ElementType>(within_register_id));
2954          // Stop processing if memory is inaccessible. It's also the only case where we have to set
2955          // vstart to a non-zero value!
2956 if (exception_raised) {
2957 SetCsr<CsrName::kVstart>(element_index);
2958 return;
2959 }
2960 }
2961 }
2962 // Next group should be fully processed.
2963 vstart = 0;
2964 }
2965 }
2966
2967 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorViotam(uint8_t dst,uint8_t src1)2968 void OpVectorViotam(uint8_t dst, uint8_t src1) {
2969 return OpVectorViotam<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src1);
2970 }
2971
2972 template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorViotam(uint8_t dst,uint8_t src1)2973 void OpVectorViotam(uint8_t dst, uint8_t src1) {
2974 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
2975 size_t vstart = GetCsr<CsrName::kVstart>();
2976 size_t vl = GetCsr<CsrName::kVl>();
2977 if (vstart != 0) {
2978 return Undefined();
2979 }
2980 // When vl = 0, there are no body elements, and no elements are updated in any destination
2981 // vector register group, including that no tail elements are updated with agnostic values.
2982 if (vl == 0) [[unlikely]] {
2983 return;
2984 }
2985 SIMD128Register arg1(state_->cpu.v[src1]);
2986 auto mask = GetMaskForVectorOperations<vma>();
2987 if constexpr (std::is_same_v<decltype(mask), SIMD128Register>) {
2988 arg1 &= mask;
2989 }
2990
2991 size_t counter = 0;
2992 for (size_t index = 0; index < kRegistersInvolved; ++index) {
2993 SIMD128Register result{state_->cpu.v[dst + index]};
2994 auto [original_dst_value, new_counter] = intrinsics::Viotam<ElementType>(arg1, counter);
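      // Each destination register consumes kElementsCount mask bits from arg1: shift the used bits
      // out and carry the running count of set mask bits into the next register via counter.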
2995 arg1.Set(arg1.Get<__uint128_t>() >> kElementsCount);
2996 counter = new_counter;
2997
2998 // Apply mask and put result values into dst register.
2999 result =
3000 VectorMasking<ElementType, vta, vma>(result, original_dst_value, vstart, vl, index, mask);
3001 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3002 }
3003 }
3004
3005 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorVidv(uint8_t dst)3006 void OpVectorVidv(uint8_t dst) {
3007 return OpVectorVidv<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst);
3008 }
3009
3010 template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorVidv(uint8_t dst)3011 void OpVectorVidv(uint8_t dst) {
3012 if (!IsAligned<kRegistersInvolved>(dst)) {
3013 return Undefined();
3014 }
3015 size_t vstart = GetCsr<CsrName::kVstart>();
3016 size_t vl = GetCsr<CsrName::kVl>();
3017 SetCsr<CsrName::kVstart>(0);
3018 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3019 // vector register group, including that no tail elements are updated with agnostic values.
3020 if (vstart >= vl) [[unlikely]] {
3021 return;
3022 }
3023 auto mask = GetMaskForVectorOperations<vma>();
3024 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3025 SIMD128Register result{state_->cpu.v[dst + index]};
3026 result = VectorMasking<ElementType, vta, vma>(
3027 result, std::get<0>(intrinsics::Vidv<ElementType>(index)), vstart, vl, index, mask);
3028 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3029 }
3030 }
3031
3032 template <typename ElementType>
OpVectorVmvfs(uint8_t dst,uint8_t src)3033 void OpVectorVmvfs(uint8_t dst, uint8_t src) {
3034    // Note: intrinsics::NanBox always receives a Float64 argument, even if it processes a Float32
3035    // value, to avoid recursion in intrinsics handling.
3036    // NanBox in the interpreter takes FpRegister and returns FpRegister which is probably the
3037    // cleanest way of processing that data (at least on x86-64 this produces code that's close to
3038    // optimal).
3039 NanBoxAndSetFpReg<ElementType>(dst, SIMD128Register{state_->cpu.v[src]}.Get<FpRegister>(0));
3040 SetCsr<CsrName::kVstart>(0);
3041 }
3042
3043 template <typename ElementType, TailProcessing vta>
OpVectorVmvsx(uint8_t dst,ElementType element)3044 void OpVectorVmvsx(uint8_t dst, ElementType element) {
3045 size_t vstart = GetCsr<CsrName::kVstart>();
3046 size_t vl = GetCsr<CsrName::kVl>();
3047    // Documentation doesn't specify what happens when vstart is non-zero but less than vl.
3048 // But at least one hardware implementation treats it as NOP:
3049 // https://github.com/riscv/riscv-v-spec/issues/937
3050 // We are doing the same here.
3051 if (vstart == 0 && vl != 0) [[likely]] {
3052 SIMD128Register result;
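      // vmv.s.x only writes element 0; the remaining elements are tail. Under tail-agnostic we fill
      // them with all ones, under tail-undisturbed we keep the current register contents.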
3053 if constexpr (vta == intrinsics::TailProcessing::kAgnostic) {
3054 result = ~SIMD128Register{};
3055 } else {
3056 result.Set(state_->cpu.v[dst]);
3057 }
3058 result.Set(element, 0);
3059 state_->cpu.v[dst] = result.Get<Int128>();
3060 }
3061 SetCsr<CsrName::kVstart>(0);
3062 }
3063
3064 template <typename ElementType>
OpVectorVmvxs(uint8_t dst,uint8_t src1)3065 void OpVectorVmvxs(uint8_t dst, uint8_t src1) {
3066 static_assert(ElementType::kIsSigned);
3067 // Conversion to Int64 would perform sign-extension if source element is signed.
3068 Register element = Int64{SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0)};
3069 SetRegOrIgnore(dst, element);
3070 SetCsr<CsrName::kVstart>(0);
3071 }
3072
3073 template <auto Intrinsic, auto vma>
OpVectorVWXUnary0(uint8_t dst,uint8_t src1)3074 void OpVectorVWXUnary0(uint8_t dst, uint8_t src1) {
3075 size_t vstart = GetCsr<CsrName::kVstart>();
3076 size_t vl = GetCsr<CsrName::kVl>();
3077 if (vstart != 0) [[unlikely]] {
3078 return Undefined();
3079 }
3080    // Note: vcpop.m and vfirst.m are an explicit exception to the rule that vstart >= vl doesn't
3081    // perform any operations; they are explicitly defined to perform a write even if vl == 0.
3082 SIMD128Register arg1(state_->cpu.v[src1]);
3083 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3084 SIMD128Register mask(state_->cpu.v[0]);
3085 arg1 &= mask;
3086 }
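    // MakeBitmaskFromVl produces a mask with the bits at positions vl and above set; clearing those
    // bits ensures that tail elements do not contribute to the intrinsic result (e.g. the population
    // count for vcpop.m).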
3087 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3088 arg1 &= ~tail_mask;
3089 SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3090 SetRegOrIgnore(dst, TruncateTo<UInt64>(BitCastToUnsigned(result.Get<Int128>())));
3091 }
3092
3093 template <auto Intrinsic>
OpVectormm(uint8_t dst,uint8_t src1,uint8_t src2)3094 void OpVectormm(uint8_t dst, uint8_t src1, uint8_t src2) {
3095 size_t vstart = GetCsr<CsrName::kVstart>();
3096 size_t vl = GetCsr<CsrName::kVl>();
3097 SetCsr<CsrName::kVstart>(0);
3098 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3099 // vector register group, including that no tail elements are updated with agnostic values.
3100 if (vstart >= vl) [[unlikely]] {
3101 return;
3102 }
3103 SIMD128Register arg1(state_->cpu.v[src1]);
3104 SIMD128Register arg2(state_->cpu.v[src2]);
3105 SIMD128Register result;
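    // Mask-register operations always work on a single register: bits below vstart keep their old
    // value and bits at positions vl and above are treated as tail-agnostic (set to all ones below).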
3106 if (vstart > 0) [[unlikely]] {
3107 const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3108 result.Set(state_->cpu.v[dst]);
3109 result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask);
3110 } else {
3111 result = Intrinsic(arg1, arg2);
3112 }
3113 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3114 result = result | tail_mask;
3115 state_->cpu.v[dst] = result.Get<__uint128_t>();
3116 }
3117
3118 template <auto Intrinsic, auto vma>
OpVectorVMUnary0(uint8_t dst,uint8_t src1)3119 void OpVectorVMUnary0(uint8_t dst, uint8_t src1) {
3120 size_t vstart = GetCsr<CsrName::kVstart>();
3121 size_t vl = GetCsr<CsrName::kVl>();
3122 if (vstart != 0) {
3123 return Undefined();
3124 }
3125 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3126 // vector register group, including that no tail elements are updated with agnostic values.
3127 if (vl == 0) [[unlikely]] {
3128 return;
3129 }
3130 SIMD128Register arg1(state_->cpu.v[src1]);
3131 SIMD128Register mask;
3132 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3133 mask.Set<__uint128_t>(state_->cpu.v[0]);
3134 arg1 &= mask;
3135 }
3136 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3137 arg1 &= ~tail_mask;
3138 SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3139 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3140 arg1 &= mask;
3141 if (vma == InactiveProcessing::kUndisturbed) {
3142 result = (result & mask) | (SIMD128Register(state_->cpu.v[dst]) & ~mask);
3143 } else {
3144 result |= ~mask;
3145 }
3146 }
3147 result |= tail_mask;
3148 state_->cpu.v[dst] = result.Get<__uint128_t>();
3149 }
3150
3151 template <typename ElementType, size_t kRegistersInvolved>
OpVectorVmvXrv(uint8_t dst,uint8_t src)3152 void OpVectorVmvXrv(uint8_t dst, uint8_t src) {
3153 if (!IsAligned<kRegistersInvolved>(dst | src)) {
3154 return Undefined();
3155 }
3156 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
3157 size_t vstart = GetCsr<CsrName::kVstart>();
3158 SetCsr<CsrName::kVstart>(0);
3159 // The usual property that no elements are written if vstart >= vl does not apply to these
3160 // instructions. Instead, no elements are written if vstart >= evl.
3161 if (vstart >= kElementsCount * kRegistersInvolved) [[unlikely]] {
3162 return;
3163 }
3164 if (vstart == 0) [[likely]] {
3165 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3166 state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3167 }
3168 return;
3169 }
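    // Non-zero vstart: elements below vstart in the partially-covered register stay untouched, the
    // remaining elements of that register are copied one by one, and every following register of the
    // group is copied wholesale.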
3170 size_t index = vstart / kElementsCount;
3171 SIMD128Register destination{state_->cpu.v[dst + index]};
3172 SIMD128Register source{state_->cpu.v[src + index]};
3173 for (size_t element_index = vstart % kElementsCount; element_index < kElementsCount;
3174 ++element_index) {
3175 destination.Set(source.Get<ElementType>(element_index), element_index);
3176 }
3177 state_->cpu.v[dst + index] = destination.Get<__uint128_t>();
3178 for (index++; index < kRegistersInvolved; ++index) {
3179 state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3180 }
3181 }
3182
3183 template <auto Intrinsic,
3184 typename ElementType,
3185 VectorRegisterGroupMultiplier vlmul,
3186 auto vma,
3187 CsrName... kExtraCsrs>
OpVectorToMaskvv(uint8_t dst,uint8_t src1,uint8_t src2)3188 void OpVectorToMaskvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3189 return OpVectorToMask<Intrinsic,
3190 ElementType,
3191 NumberOfRegistersInvolved(vlmul),
3192 vma,
3193 kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3194 }
3195
3196 template <auto Intrinsic,
3197 typename ElementType,
3198 VectorRegisterGroupMultiplier vlmul,
3199 auto vma,
3200 CsrName... kExtraCsrs>
OpVectorToMaskvx(uint8_t dst,uint8_t src1,ElementType arg2)3201 void OpVectorToMaskvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3202 return OpVectorToMask<Intrinsic,
3203 ElementType,
3204 NumberOfRegistersInvolved(vlmul),
3205 vma,
3206 kExtraCsrs...>(dst, Vec{src1}, arg2);
3207 }
3208
3209 template <auto Intrinsic,
3210 typename ElementType,
3211 size_t kRegistersInvolved,
3212 auto vma,
3213 CsrName... kExtraCsrs,
3214 typename... Args>
OpVectorToMask(uint8_t dst,Args...args)3215 void OpVectorToMask(uint8_t dst, Args... args) {
3216    // All args except dst must be aligned at kRegistersInvolved amount. We'll merge them
3217    // together and then do a combined check for all of them at once.
3218 if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...))) {
3219 return Undefined();
3220 }
3221 SIMD128Register original_result(state_->cpu.v[dst]);
3222 size_t vstart = GetCsr<CsrName::kVstart>();
3223 size_t vl = GetCsr<CsrName::kVl>();
3224 SetCsr<CsrName::kVstart>(0);
3225 SIMD128Register result_before_vl_masking;
3226 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3227 // vector register group, including that no tail elements are updated with agnostic values.
3228 if (vstart >= vl) [[unlikely]] {
3229 result_before_vl_masking = original_result;
3230 } else {
3231 result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>(
3232 [this, vstart, vl, args...](auto index) {
3233 return Intrinsic(this->GetCsr<kExtraCsrs>()...,
3234 this->GetVectorArgument<ElementType, TailProcessing::kAgnostic, vma>(
3235 args, vstart, vl, index, intrinsics::NoInactiveProcessing{})...);
3236 });
3237 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3238 SIMD128Register mask(state_->cpu.v[0]);
3239 if constexpr (vma == InactiveProcessing::kAgnostic) {
3240 result_before_vl_masking |= ~mask;
3241 } else {
3242 result_before_vl_masking = (mask & result_before_vl_masking) | (original_result & ~mask);
3243 }
3244 }
3245 if (vstart > 0) [[unlikely]] {
3246 const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3247 result_before_vl_masking =
3248 (original_result & ~start_mask) | (result_before_vl_masking & start_mask);
3249 }
3250 }
3251 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3252 state_->cpu.v[dst] = (result_before_vl_masking | tail_mask).Get<__uint128_t>();
3253 }
3254
3255 template <auto Intrinsic,
3256 typename ElementType,
3257 VectorRegisterGroupMultiplier vlmul,
3258 TailProcessing vta,
3259 auto vma,
3260 CsrName... kExtraCsrs,
3261 typename... DstMaskType>
OpVectorv(uint8_t dst,uint8_t src1,DstMaskType...dst_mask)3262 void OpVectorv(uint8_t dst, uint8_t src1, DstMaskType... dst_mask) {
3263 return OpVectorv<Intrinsic,
3264 ElementType,
3265 NumberOfRegistersInvolved(vlmul),
3266 vta,
3267 vma,
3268 kExtraCsrs...>(dst, src1, dst_mask...);
3269 }
3270
3271 template <auto Intrinsic,
3272 typename ElementType,
3273 size_t kRegistersInvolved,
3274 TailProcessing vta,
3275 auto vma,
3276 CsrName... kExtraCsrs,
3277 typename... DstMaskType>
OpVectorv(uint8_t dst,uint8_t src,DstMaskType...dst_mask)3278 void OpVectorv(uint8_t dst, uint8_t src, DstMaskType... dst_mask) {
3279 static_assert(sizeof...(dst_mask) <= 1);
3280 if (!IsAligned<kRegistersInvolved>(dst | src | (dst_mask | ... | 0))) {
3281 return Undefined();
3282 }
3283 size_t vstart = GetCsr<CsrName::kVstart>();
3284 size_t vl = GetCsr<CsrName::kVl>();
3285 SetCsr<CsrName::kVstart>(0);
3286 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3287 // vector register group, including that no tail elements are updated with agnostic values.
3288 if (vstart >= vl) [[unlikely]] {
3289 return;
3290 }
3291 auto mask = GetMaskForVectorOperations<vma>();
3292 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3293 SIMD128Register result{state_->cpu.v[dst + index]};
3294 SIMD128Register result_mask;
3295 if constexpr (sizeof...(DstMaskType) == 0) {
3296 result_mask.Set(state_->cpu.v[dst + index]);
3297 } else {
3298 uint8_t dst_mask_unpacked[1] = {dst_mask...};
3299 result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
3300 }
3301 SIMD128Register arg{state_->cpu.v[src + index]};
3302 result =
3303 VectorMasking<ElementType, vta, vma>(result,
3304 std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg)),
3305 result_mask,
3306 vstart,
3307 vl,
3308 index,
3309 mask);
3310 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3311 }
3312 }
3313
3314 template <auto Intrinsic,
3315 typename ElementType,
3316 VectorRegisterGroupMultiplier vlmul,
3317 TailProcessing vta,
3318 auto vma,
3319 CsrName... kExtraCsrs,
3320 auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3321 void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3322 return OpVectorvs<Intrinsic,
3323 ElementType,
3324 NumberOfRegistersInvolved(vlmul),
3325 vta,
3326 vma,
3327 kExtraCsrs...>(dst, src1, src2);
3328 }
3329
3330 template <auto Intrinsic,
3331 typename ElementType,
3332 size_t kRegistersInvolved,
3333 TailProcessing vta,
3334 auto vma,
3335 CsrName... kExtraCsrs,
3336 auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3337 void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3338 if (!IsAligned<kRegistersInvolved>(dst | src1.start_no)) {
3339 return Undefined();
3340 }
3341 size_t vstart = GetCsr<CsrName::kVstart>();
3342 size_t vl = GetCsr<CsrName::kVl>();
3343 if (vstart != 0) {
3344 return Undefined();
3345 }
3346 SetCsr<CsrName::kVstart>(0);
3347 // If vl = 0, no operation is performed and the destination register is not updated.
3348 if (vl == 0) [[unlikely]] {
3349 return;
3350 }
3351 auto mask = GetMaskForVectorOperations<vma>();
3352 ElementType init = SIMD128Register{state_->cpu.v[src2]}.Get<ElementType>(0);
3353 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3354 init = std::get<0>(
3355 Intrinsic(GetCsr<kExtraCsrs>()...,
3356 init,
3357 GetVectorArgument<ElementType, vta, vma>(src1, vstart, vl, index, mask)));
3358 }
3359 SIMD128Register result{state_->cpu.v[dst]};
3360 result.Set(init, 0);
3361 result = std::get<0>(intrinsics::VectorMasking<ElementType, vta>(result, result, 0, 1));
3362 state_->cpu.v[dst] = result.Get<__uint128_t>();
3363 }
3364
3365 template <auto Intrinsic,
3366 typename ElementType,
3367 VectorRegisterGroupMultiplier vlmul,
3368 TailProcessing vta,
3369 auto vma,
3370 CsrName... kExtraCsrs>
OpVectorvv(uint8_t dst,uint8_t src1,uint8_t src2)3371 void OpVectorvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3372 return OpVectorSameWidth<Intrinsic,
3373 ElementType,
3374 NumberOfRegistersInvolved(vlmul),
3375 vta,
3376 vma,
3377 kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3378 }
3379
3380 template <auto Intrinsic,
3381 typename ElementType,
3382 VectorRegisterGroupMultiplier vlmul,
3383 TailProcessing vta,
3384 auto vma,
3385 CsrName... kExtraCsrs>
OpVectorvvv(uint8_t dst,uint8_t src1,uint8_t src2)3386 void OpVectorvvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3387 return OpVectorSameWidth<Intrinsic,
3388 ElementType,
3389 NumberOfRegistersInvolved(vlmul),
3390 vta,
3391 vma,
3392 kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, Vec{dst});
3393 }
3394
3395 template <auto Intrinsic,
3396 typename ElementType,
3397 VectorRegisterGroupMultiplier vlmul,
3398 TailProcessing vta,
3399 auto vma,
3400 CsrName... kExtraCsrs>
OpVectorWidenv(uint8_t dst,uint8_t src)3401 void OpVectorWidenv(uint8_t dst, uint8_t src) {
3402 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3403 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3404 return OpVectorWiden<Intrinsic,
3405 ElementType,
3406 NumRegistersInvolvedForWideOperand(vlmul),
3407 NumberOfRegistersInvolved(vlmul),
3408 vta,
3409 vma,
3410 kExtraCsrs...>(dst, Vec{src});
3411 }
3412 return Undefined();
3413 }
3414
3415 // 2*SEW = SEW op SEW
3416 // Attention: not to confuse with OpVectorWidenwv with 2*SEW = 2*SEW op SEW
3417 template <auto Intrinsic,
3418 typename ElementType,
3419 VectorRegisterGroupMultiplier vlmul,
3420 TailProcessing vta,
3421 auto vma,
3422 CsrName... kExtraCsrs>
OpVectorWidenvv(uint8_t dst,uint8_t src1,uint8_t src2)3423 void OpVectorWidenvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3424 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3425 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3426 return OpVectorWiden<Intrinsic,
3427 ElementType,
3428 NumRegistersInvolvedForWideOperand(vlmul),
3429 NumberOfRegistersInvolved(vlmul),
3430 vta,
3431 vma,
3432 kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3433 }
3434 return Undefined();
3435 }
3436
3437 // 2*SEW = SEW op SEW op 2*SEW
3438 template <auto Intrinsic,
3439 typename ElementType,
3440 VectorRegisterGroupMultiplier vlmul,
3441 TailProcessing vta,
3442 auto vma,
3443 CsrName... kExtraCsrs>
OpVectorWidenvvw(uint8_t dst,uint8_t src1,uint8_t src2)3444 void OpVectorWidenvvw(uint8_t dst, uint8_t src1, uint8_t src2) {
3445 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3446 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3447 return OpVectorWiden<Intrinsic,
3448 ElementType,
3449 NumRegistersInvolvedForWideOperand(vlmul),
3450 NumberOfRegistersInvolved(vlmul),
3451 vta,
3452 vma,
3453 kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, WideVec{dst});
3454 }
3455 return Undefined();
3456 }
3457
3458 // 2*SEW = 2*SEW op SEW
3459 template <auto Intrinsic,
3460 typename ElementType,
3461 VectorRegisterGroupMultiplier vlmul,
3462 TailProcessing vta,
3463 auto vma,
3464 CsrName... kExtraCsrs>
OpVectorWidenwv(uint8_t dst,uint8_t src1,uint8_t src2)3465 void OpVectorWidenwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3466 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3467 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3468 return OpVectorWiden<Intrinsic,
3469 ElementType,
3470 NumRegistersInvolvedForWideOperand(vlmul),
3471 NumberOfRegistersInvolved(vlmul),
3472 vta,
3473 vma,
3474 kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3475 }
3476 return Undefined();
3477 }
3478
3479 template <auto Intrinsic,
3480 typename ElementType,
3481 VectorRegisterGroupMultiplier vlmul,
3482 TailProcessing vta,
3483 auto vma,
3484 CsrName... kExtraCsrs>
OpVectorWidenwx(uint8_t dst,uint8_t src1,ElementType arg2)3485 void OpVectorWidenwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3486 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3487 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3488 return OpVectorWiden<Intrinsic,
3489 ElementType,
3490 NumRegistersInvolvedForWideOperand(vlmul),
3491 NumberOfRegistersInvolved(vlmul),
3492 vta,
3493 vma,
3494 kExtraCsrs...>(dst, WideVec{src1}, arg2);
3495 }
3496 return Undefined();
3497 }
3498
3499 template <auto Intrinsic,
3500 typename ElementType,
3501 VectorRegisterGroupMultiplier vlmul,
3502 TailProcessing vta,
3503 auto vma,
3504 CsrName... kExtraCsrs>
OpVectorWidenvx(uint8_t dst,uint8_t src1,ElementType arg2)3505 void OpVectorWidenvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3506 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3507 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3508 return OpVectorWiden<Intrinsic,
3509 ElementType,
3510 NumRegistersInvolvedForWideOperand(vlmul),
3511 NumberOfRegistersInvolved(vlmul),
3512 vta,
3513 vma,
3514 kExtraCsrs...>(dst, Vec{src1}, arg2);
3515 }
3516 return Undefined();
3517 }
3518
3519 template <auto Intrinsic,
3520 typename ElementType,
3521 VectorRegisterGroupMultiplier vlmul,
3522 TailProcessing vta,
3523 auto vma,
3524 CsrName... kExtraCsrs>
OpVectorWidenvxw(uint8_t dst,uint8_t src1,ElementType arg2)3525 void OpVectorWidenvxw(uint8_t dst, uint8_t src1, ElementType arg2) {
3526 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3527 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3528 return OpVectorWiden<Intrinsic,
3529 ElementType,
3530 NumRegistersInvolvedForWideOperand(vlmul),
3531 NumberOfRegistersInvolved(vlmul),
3532 vta,
3533 vma,
3534 kExtraCsrs...>(dst, Vec{src1}, arg2, WideVec{dst});
3535 }
3536 return Undefined();
3537 }
3538
3539 template <auto Intrinsic,
3540 typename ElementType,
3541 size_t kDestRegistersInvolved,
3542 size_t kRegistersInvolved,
3543 TailProcessing vta,
3544 auto vma,
3545 CsrName... kExtraCsrs,
3546 typename... Args>
OpVectorWiden(uint8_t dst,Args...args)3547 void OpVectorWiden(uint8_t dst, Args... args) {
3548 if constexpr (kDestRegistersInvolved == kRegistersInvolved) {
3549 static_assert(kDestRegistersInvolved == 1);
3550 } else {
3551 static_assert(kDestRegistersInvolved == 2 * kRegistersInvolved);
3552 // All normal (narrow) args must be aligned at kRegistersInvolved amount. We'll merge them
3553 // together and then do a combined check for all of them at once.
3554 uint8_t ored_args = OrValuesOnlyForType<Vec>(args...);
3555      // All wide args, together with dst, must be aligned at kDestRegistersInvolved amount. We'll
3556      // merge them together and then do a combined check for all of them at once.
3557 uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...) | dst;
3558 if (!IsAligned<kDestRegistersInvolved>(ored_wide_args) ||
3559 !IsAligned<kRegistersInvolved>(ored_args)) {
3560 return Undefined();
3561 }
3562 }
3563    // From the RISC-V vector manual: if the destination EEW is greater than the source EEW and the
3564    // source EMUL is at least 1, then overlap is permitted if the overlap is in the highest-numbered
3565    // part of the destination register group (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a
3566    // source of v0, v2, or v4 is not).
3567    // Here only one forbidden combination is possible because of the static_asserts above, and we
3568    // detect and reject it.
3569 if (OrResultsOnlyForType<Vec>([dst](auto arg) { return arg.start_no == dst; }, args...)) {
3570 return Undefined();
3571 }
3572 size_t vstart = GetCsr<CsrName::kVstart>();
3573 size_t vl = GetCsr<CsrName::kVl>();
3574 SetCsr<CsrName::kVstart>(0);
3575 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3576 // vector register group, including that no tail elements are updated with agnostic values.
3577 if (vstart >= vl) [[unlikely]] {
3578 return;
3579 }
3580 auto mask = GetMaskForVectorOperations<vma>();
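    // Every narrow source register produces two wide destination registers: the low half of the
    // source elements goes into dst + 2 * index and the high half into dst + 2 * index + 1 (the
    // latter is skipped when the whole result fits into a single destination register).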
3581 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3582 SIMD128Register result(state_->cpu.v[dst + 2 * index]);
3583 result = VectorMasking<WideType<ElementType>, vta, vma>(
3584 result,
3585 std::get<0>(Intrinsic(
3586 GetCsr<kExtraCsrs>()...,
3587 GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3588 vstart,
3589 vl,
3590 2 * index,
3591 mask);
3592 state_->cpu.v[dst + 2 * index] = result.Get<__uint128_t>();
3593 if constexpr (kDestRegistersInvolved > 1) { // if lmul is one full register or more
3594 result.Set(state_->cpu.v[dst + 2 * index + 1]);
3595 result = VectorMasking<WideType<ElementType>, vta, vma>(
3596 result,
3597 std::get<0>(Intrinsic(
3598 GetCsr<kExtraCsrs>()...,
3599 GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3600 vstart,
3601 vl,
3602 2 * index + 1,
3603 mask);
3604 state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>();
3605 }
3606 }
3607 }
3608
3609 template <auto Intrinsic,
3610 typename ElementType,
3611 VectorRegisterGroupMultiplier vlmul,
3612 TailProcessing vta,
3613 auto vma,
3614 CsrName... kExtraCsrs>
OpVectorvx(uint8_t dst,uint8_t src1,ElementType arg2)3615 void OpVectorvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3616 return OpVectorSameWidth<Intrinsic,
3617 ElementType,
3618 NumberOfRegistersInvolved(vlmul),
3619 vta,
3620 vma,
3621 kExtraCsrs...>(dst, Vec{src1}, arg2);
3622 }
3623
3624 template <auto Intrinsic,
3625 typename ElementType,
3626 size_t kRegistersInvolved,
3627 TailProcessing vta,
3628 auto vma,
3629 CsrName... kExtraCsrs,
3630 typename... Args>
OpVectorSameWidth(uint8_t dst,Args...args)3631 void OpVectorSameWidth(uint8_t dst, Args... args) {
3632 // All args must be aligned at kRegistersInvolved amount. We'll merge them
3633 // together and then do a combined check for all of them at once.
3634 if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...) | dst)) {
3635 return Undefined();
3636 }
3637 size_t vstart = GetCsr<CsrName::kVstart>();
3638 size_t vl = GetCsr<CsrName::kVl>();
3639 SetCsr<CsrName::kVstart>(0);
3640 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3641 // vector register group, including that no tail elements are updated with agnostic values.
3642 if (vstart >= vl) [[unlikely]] {
3643 return;
3644 }
3645 auto mask = GetMaskForVectorOperations<vma>();
3646 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3647 SIMD128Register result(state_->cpu.v[dst + index]);
3648 result = VectorMasking<ElementType, vta, vma>(
3649 result,
3650 std::get<0>(Intrinsic(
3651 GetCsr<kExtraCsrs>()...,
3652 GetVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3653 vstart,
3654 vl,
3655 index,
3656 mask);
3657 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3658 }
3659 }
3660
3661 template <auto Intrinsic,
3662 typename TargetElementType,
3663 VectorRegisterGroupMultiplier vlmul,
3664 TailProcessing vta,
3665 auto vma,
3666 CsrName... kExtraCsrs>
OpVectorNarroww(uint8_t dst,uint8_t src)3667 void OpVectorNarroww(uint8_t dst, uint8_t src) {
3668 if constexpr (sizeof(TargetElementType) < sizeof(Int64) &&
3669 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3670 return OpVectorNarrow<Intrinsic,
3671 TargetElementType,
3672 NumberOfRegistersInvolved(vlmul),
3673 NumRegistersInvolvedForWideOperand(vlmul),
3674 vta,
3675 vma,
3676 kExtraCsrs...>(dst, WideVec{src});
3677 }
3678 return Undefined();
3679 }
3680
3681 // SEW = 2*SEW op SEW
3682 template <auto Intrinsic,
3683 typename ElementType,
3684 VectorRegisterGroupMultiplier vlmul,
3685 TailProcessing vta,
3686 auto vma,
3687 CsrName... kExtraCsrs>
OpVectorNarrowwx(uint8_t dst,uint8_t src1,ElementType arg2)3688 void OpVectorNarrowwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3689 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3690 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3691 return OpVectorNarrow<Intrinsic,
3692 ElementType,
3693 NumberOfRegistersInvolved(vlmul),
3694 NumRegistersInvolvedForWideOperand(vlmul),
3695 vta,
3696 vma,
3697 kExtraCsrs...>(dst, WideVec{src1}, arg2);
3698 }
3699 return Undefined();
3700 }
3701
3702 // SEW = 2*SEW op SEW
3703 template <auto Intrinsic,
3704 typename ElementType,
3705 VectorRegisterGroupMultiplier vlmul,
3706 TailProcessing vta,
3707 auto vma,
3708 CsrName... kExtraCsrs>
OpVectorNarrowwv(uint8_t dst,uint8_t src1,uint8_t src2)3709 void OpVectorNarrowwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3710 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3711 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3712 return OpVectorNarrow<Intrinsic,
3713 ElementType,
3714 NumberOfRegistersInvolved(vlmul),
3715 NumRegistersInvolvedForWideOperand(vlmul),
3716 vta,
3717 vma,
3718 kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3719 }
3720 return Undefined();
3721 }
3722
3723 template <auto Intrinsic,
3724 typename ElementType,
3725 size_t kRegistersInvolved,
3726 size_t kWideSrcRegistersInvolved,
3727 TailProcessing vta,
3728 auto vma,
3729 CsrName... kExtraCsrs,
3730 typename... Args>
OpVectorNarrow(uint8_t dst,Args...args)3731 void OpVectorNarrow(uint8_t dst, Args... args) {
3732 if constexpr (kWideSrcRegistersInvolved == kRegistersInvolved) {
3733 static_assert(kWideSrcRegistersInvolved == 1);
3734 } else {
3735 // All normal (narrow) args must be aligned at kRegistersInvolved amount. We'll merge them
3736 // together and then do a combined check for all of them at once.
3737 uint8_t ored_args = OrValuesOnlyForType<Vec>(args...) | dst;
3738 // All wide args must be aligned at kWideSrcRegistersInvolved amount. We'll merge them
3739 // together and then do a combined check for all of them at once.
3740 uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...);
3741 if (!IsAligned<kWideSrcRegistersInvolved>(ored_wide_args) ||
3742 !IsAligned<kRegistersInvolved>(ored_args)) {
3743 return Undefined();
3744 }
3745 static_assert(kWideSrcRegistersInvolved == 2 * kRegistersInvolved);
3746      // From the RISC-V vector manual: if the destination EEW is smaller than the source EEW, then
3747      // overlap is permitted if the overlap is in the lowest-numbered part of the source register
3748      // group (e.g., when LMUL=1, vnsrl.wi v0, v0, 3 is legal, but a destination of v1 is not).
3749      // We only have one possible invalid value here because of the alignment requirements.
3750 if (OrResultsOnlyForType<Vec>(
3751 [dst](auto arg) { return arg.start_no == dst + kRegistersInvolved; }, args...)) {
3752 return Undefined();
3753 }
3754 }
3755 size_t vstart = GetCsr<CsrName::kVstart>();
3756 size_t vl = GetCsr<CsrName::kVl>();
3757 SetCsr<CsrName::kVstart>(0);
3758 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3759 // vector register group, including that no tail elements are updated with agnostic values.
3760 if (vstart >= vl) [[unlikely]] {
3761 return;
3762 }
3763 auto mask = GetMaskForVectorOperations<vma>();
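    // Each narrow destination register is assembled from two adjacent wide source registers: results
    // for the low and high halves are computed separately and packed together with
    // VMergeBottomHalfToTop (unless the wide source group is a single register).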
3764 for (size_t index = 0; index < kRegistersInvolved; index++) {
3765 SIMD128Register orig_result(state_->cpu.v[dst + index]);
3766 SIMD128Register intrinsic_result = std::get<0>(
3767 Intrinsic(GetCsr<kExtraCsrs>()...,
3768 GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
3769 if constexpr (kWideSrcRegistersInvolved > 1) {
3770 SIMD128Register result_high = std::get<0>(Intrinsic(
3771 GetCsr<kExtraCsrs>()...,
3772 GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
3773 intrinsic_result = std::get<0>(
3774 intrinsics::VMergeBottomHalfToTop<ElementType>(intrinsic_result, result_high));
3775 }
3776 auto result = VectorMasking<ElementType, vta, vma>(
3777 orig_result, intrinsic_result, vstart, vl, index, mask);
3778 state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
3779 }
3780 }
3781
3782 template <auto Intrinsic,
3783 typename DestElementType,
3784 const uint8_t kFactor,
3785 VectorRegisterGroupMultiplier vlmul,
3786 TailProcessing vta,
3787 auto vma>
OpVectorVXUnary0(uint8_t dst,uint8_t src)3788 void OpVectorVXUnary0(uint8_t dst, uint8_t src) {
3789 static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8);
3790 constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul);
3791 constexpr size_t kSourceRegistersInvolved = (kDestRegistersInvolved / kFactor) ?: 1;
3792 if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) {
3793 return Undefined();
3794 }
3795 size_t vstart = GetCsr<CsrName::kVstart>();
3796 size_t vl = GetCsr<CsrName::kVl>();
3797 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3798 // vector register group, including that no tail elements are updated with agnostic values.
3799 if (vstart >= vl) [[unlikely]] {
3800 SetCsr<CsrName::kVstart>(0);
3801 return;
3802 }
3803 auto mask = GetMaskForVectorOperations<vma>();
3804 for (size_t dst_index = 0; dst_index < kDestRegistersInvolved; dst_index++) {
3805 size_t src_index = dst_index / kFactor;
3806 size_t src_elem = dst_index % kFactor;
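      // Each destination register widens a 128 / kFactor bit slice of one source register; shift the
      // selected slice down to bit 0 so the extension intrinsic always reads from the bottom of arg.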
3807 SIMD128Register result{state_->cpu.v[dst + dst_index]};
3808 SIMD128Register arg{state_->cpu.v[src + src_index] >> ((128 / kFactor) * src_elem)};
3809
3810 result = VectorMasking<DestElementType, vta, vma>(
3811 result, std::get<0>(Intrinsic(arg)), vstart, vl, dst_index, mask);
3812 state_->cpu.v[dst + dst_index] = result.Get<__uint128_t>();
3813 }
3814 SetCsr<CsrName::kVstart>(0);
3815 }
3816
3817 template <auto Intrinsic,
3818 typename ElementType,
3819 VectorRegisterGroupMultiplier vlmul,
3820 TailProcessing vta,
3821 auto vma,
3822 CsrName... kExtraCsrs>
OpVectorvxv(uint8_t dst,uint8_t src1,ElementType arg2)3823 void OpVectorvxv(uint8_t dst, uint8_t src1, ElementType arg2) {
3824 return OpVectorSameWidth<Intrinsic,
3825 ElementType,
3826 NumberOfRegistersInvolved(vlmul),
3827 vta,
3828 vma,
3829 kExtraCsrs...>(dst, Vec{src1}, arg2, Vec{dst});
3830 }
3831
3832 template <auto Intrinsic,
3833 typename ElementType,
3834 VectorRegisterGroupMultiplier vlmul,
3835 TailProcessing vta,
3836 auto vma,
3837 typename... DstMaskType>
OpVectorx(uint8_t dst,ElementType arg2,DstMaskType...dst_mask)3838 void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
3839 return OpVectorx<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
3840 dst, arg2, dst_mask...);
3841 }
3842
3843 template <auto Intrinsic,
3844 typename ElementType,
3845 size_t kRegistersInvolved,
3846 TailProcessing vta,
3847 auto vma,
3848 typename... DstMaskType>
OpVectorx(uint8_t dst,ElementType arg2,DstMaskType...dst_mask)3849 void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
3850 static_assert(sizeof...(dst_mask) <= 1);
3851 if (!IsAligned<kRegistersInvolved>(dst | (dst_mask | ... | 0))) {
3852 return Undefined();
3853 }
3854 size_t vstart = GetCsr<CsrName::kVstart>();
3855 size_t vl = GetCsr<CsrName::kVl>();
3856 SetCsr<CsrName::kVstart>(0);
3857 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3858 // vector register group, including that no tail elements are updated with agnostic values.
3859 if (vstart >= vl) [[unlikely]] {
3860 return;
3861 }
3862 auto mask = GetMaskForVectorOperations<vma>();
3863 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3864 SIMD128Register result(state_->cpu.v[dst + index]);
3865 SIMD128Register result_mask;
3866 if constexpr (sizeof...(DstMaskType) == 0) {
3867 result_mask.Set(state_->cpu.v[dst + index]);
3868 } else {
3869 uint8_t dst_mask_unpacked[1] = {dst_mask...};
3870 result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
3871 }
3872 result = VectorMasking<ElementType, vta, vma>(
3873 result, std::get<0>(Intrinsic(arg2)), result_mask, vstart, vl, index, mask);
3874 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3875 }
3876 }
3877
3878 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslideup(uint8_t dst,uint8_t src,Register offset)3879 void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
3880 return OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
3881 dst, src, offset);
3882 }
3883
3884 template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorslideup(uint8_t dst,uint8_t src,Register offset)3885 void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
3886 constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
3887 if (!IsAligned<kRegistersInvolved>(dst | src)) {
3888 return Undefined();
3889 }
3890 // Source and destination must not intersect.
3891 if (dst < (src + kRegistersInvolved) && src < (dst + kRegistersInvolved)) {
3892 return Undefined();
3893 }
3894 size_t vstart = GetCsr<CsrName::kVstart>();
3895 size_t vl = GetCsr<CsrName::kVl>();
3896 SetCsr<CsrName::kVstart>(0);
3897 if (vstart >= vl) [[unlikely]] {
3898 // From 16.3: For all of the [slide instructions], if vstart >= vl, the
3899 // instruction performs no operation and leaves the destination vector
3900 // register unchanged.
3901 return;
3902 }
3903 auto mask = GetMaskForVectorOperations<vma>();
3904    // The slideup operation leaves elements 0 through MAX(vstart, OFFSET)-1 unchanged.
3905 //
3906 // From 16.3.1: Destination elements OFFSET through vl-1 are written if
3907 // unmasked and if OFFSET < vl.
3908 // However if OFFSET > vl, we still need to apply the tail policy (as
3909 // clarified in https://github.com/riscv/riscv-v-spec/issues/263). Given
3910 // that OFFSET could be well past vl we start at vl rather than OFFSET in
3911 // that case.
3912 const size_t start_elem_index = std::min(std::max(vstart, offset), vl);
3913 for (size_t index = start_elem_index / kElementsPerRegister; index < kRegistersInvolved;
3914 ++index) {
3915 SIMD128Register result(state_->cpu.v[dst + index]);
3916
3917 // Arguments falling before the input group correspond to the first offset-amount
3918 // result elements, which must remain undisturbed. We zero-initialize them here,
3919 // but their values are eventually ignored by vstart masking in VectorMasking.
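      // E.g. with four elements per register and offset == 5, destination register `index` takes its
      // lowest element from source register index - 2 (its last element) and the rest from register
      // index - 1, hence first_arg_disp == index - 1 - offset / kElementsPerRegister.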
3920 ssize_t first_arg_disp = index - 1 - offset / kElementsPerRegister;
3921 SIMD128Register arg1 =
3922 (first_arg_disp < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp];
3923 SIMD128Register arg2 =
3924 (first_arg_disp + 1 < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp + 1];
3925
3926 result =
3927 VectorMasking<ElementType, vta, vma>(result,
3928 std::get<0>(intrinsics::VectorSlideUp<ElementType>(
3929 offset % kElementsPerRegister, arg1, arg2)),
3930 start_elem_index,
3931 vl,
3932 index,
3933 mask);
3934 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3935 }
3936 }
3937
3938 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslide1up(uint8_t dst,uint8_t src,ElementType xval)3939 void OpVectorslide1up(uint8_t dst, uint8_t src, ElementType xval) {
3940 // Save the vstart before it's reset by vslideup.
3941 size_t vstart = GetCsr<CsrName::kVstart>();
3942 // Slide all the elements by one.
3943 OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src, 1);
3944 if (exception_raised_) {
3945 return;
3946 }
3947 if (vstart > 0) {
3948 // First element is not affected and should remain untouched.
3949 return;
3950 }
3951
3952 // From 16.3.3: places the x register argument at location 0 of the
3953 // destination vector register group provided that element 0 is active,
3954 // otherwise the destination element update follows the current mask
3955 // agnostic/undisturbed policy.
3956 if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
3957 auto mask = GetMaskForVectorOperations<vma>();
3958 if (!(mask.template Get<uint8_t>(0) & 0x1)) {
3959 // The first element is masked. OpVectorslideup already applied the proper masking to it.
3960 return;
3961 }
3962 }
3963
3964 SIMD128Register result = state_->cpu.v[dst];
3965 result.Set(xval, 0);
3966 state_->cpu.v[dst] = result.Get<__uint128_t>();
3967 }
3968
3969 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslidedown(uint8_t dst,uint8_t src,Register offset)3970 void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
3971 return OpVectorslidedown<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
3972 dst, src, offset);
3973 }
3974
3975 template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorslidedown(uint8_t dst,uint8_t src,Register offset)3976 void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
3977 constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
3978 if (!IsAligned<kRegistersInvolved>(dst | src)) {
3979 return Undefined();
3980 }
3981 size_t vstart = GetCsr<CsrName::kVstart>();
3982 size_t vl = GetCsr<CsrName::kVl>();
3983 SetCsr<CsrName::kVstart>(0);
3984 if (vstart >= vl) [[unlikely]] {
3985 // From 16.3: For all of the [slide instructions], if vstart >= vl, the
3986 // instruction performs no operation and leaves the destination vector
3987 // register unchanged.
3988 return;
3989 }
3990 auto mask = GetMaskForVectorOperations<vma>();
3991 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3992 SIMD128Register result(state_->cpu.v[dst + index]);
3993
3994 size_t first_arg_disp = index + offset / kElementsPerRegister;
3995 SIMD128Register arg1 = (first_arg_disp >= kRegistersInvolved)
3996 ? SIMD128Register{0}
3997 : state_->cpu.v[src + first_arg_disp];
3998 SIMD128Register arg2 = (first_arg_disp + 1 >= kRegistersInvolved)
3999 ? SIMD128Register{0}
4000 : state_->cpu.v[src + first_arg_disp + 1];
4001
4002 result =
4003 VectorMasking<ElementType, vta, vma>(result,
4004 std::get<0>(intrinsics::VectorSlideDown<ElementType>(
4005 offset % kElementsPerRegister, arg1, arg2)),
4006 vstart,
4007 vl,
4008 index,
4009 mask);
4010 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4011 }
4012 }
4013
4014 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslide1down(uint8_t dst,uint8_t src,ElementType xval)4015 void OpVectorslide1down(uint8_t dst, uint8_t src, ElementType xval) {
4016 constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4017 const size_t vl = GetCsr<CsrName::kVl>();
4018
4019 // From 16.3.4: ... places the x register argument at location vl-1 in the
4020 // destination vector register, provided that element vl-1 is active,
4021 // otherwise the destination element is **unchanged** (emphasis added.)
4022 //
4023 // This means that element at vl-1 would not follow the Mask Agnostic policy
4024 // and would stay Unchanged when inactive. So we need to undo just this one
4025 // element if using agnostic masking.
4026 ElementType last_elem_value = xval;
4027 const size_t last_elem_register = (vl - 1) / kElementsPerRegister;
4028 const size_t last_elem_within_reg_pos = (vl - 1) % kElementsPerRegister;
4029 bool set_last_element = true;
4030 if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
4031 auto mask = GetMaskForVectorOperations<vma>();
4032 auto [mask_bits] =
4033 intrinsics::MaskForRegisterInSequence<ElementType>(mask, last_elem_register);
4034 using MaskType = decltype(mask_bits);
4035 if ((static_cast<MaskType::BaseType>(mask_bits) & (1 << last_elem_within_reg_pos)) == 0) {
4036 if constexpr (vma == intrinsics::InactiveProcessing::kUndisturbed) {
4037 // Element is inactive and the undisturbed policy will be followed,
4038          // just let OpVectorslidedown handle everything.
4039 set_last_element = false;
4040 } else {
4041          // Element is inactive and the agnostic policy will be followed; remember the original
4042          // value so we can restore it after the slide overwrites this element under the agnostic
4043          // policy.
4044 SIMD128Register original = state_->cpu.v[dst + last_elem_register];
4045 last_elem_value = original.Get<ElementType>(last_elem_within_reg_pos);
4046 }
4047 }
4048 }
4049
4050 // Slide all the elements by one.
4051 OpVectorslidedown<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src, 1);
4052 if (exception_raised_) {
4053 return;
4054 }
4055 if (!set_last_element) {
4056 return;
4057 }
4058
4059 SIMD128Register result = state_->cpu.v[dst + last_elem_register];
4060 result.Set(last_elem_value, last_elem_within_reg_pos);
4061 state_->cpu.v[dst + last_elem_register] = result.Get<__uint128_t>();
4062 }
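
  // A minimal illustrative sketch of the "last element" bookkeeping above, with hypothetical
  // numbers: assuming 16-bit elements (8 per register) and vl == 11, the element written by
  // vslide1down sits in destination register (11 - 1) / 8 == 1 at position (11 - 1) % 8 == 2,
  // which is exactly the slot restored above when it is inactive under the agnostic policy.
  static_assert((11 - 1) / (16 / sizeof(uint16_t)) == 1 && (11 - 1) % (16 / sizeof(uint16_t)) == 2);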
4063
4064 // Helper function needed to generate a bitmask result from non-bitmask inputs.
4065 // We are processing between 1 and 8 registers here, and each register produces a bitmask of
4066 // between 2 bits (for 64-bit elements) and 16 bits (for 8-bit elements); these are then
4067 // combined into the final result (between 2 and 128 bits long).
4068 // Note that we are not handling the tail here! These bits remain undefined and should be
4069 // handled later.
4070 // TODO(b/317757595): Add separate tests to verify the logic.
4071 template <typename ElementType, size_t kRegistersInvolved, typename Intrinsic>
4072 SIMD128Register CollectBitmaskResult(Intrinsic intrinsic) {
4073 // We employ two distinct tactics to handle all possibilities:
4074 // 1. For 8-bit/16-bit types we get a full UInt8/UInt16 result and thus use SIMD128Register::Set.
4075 // 2. For 32-bit/64-bit types we only get 2 or 4 bits from each call and thus need to use
4076 // shifts to accumulate the result.
4077 // Since each of the up to 8 results is at most 4 bits, the total bitmask is 32 bits (or less).
4078 std::conditional_t<sizeof(ElementType) < sizeof(UInt32), SIMD128Register, UInt32>
4079 bitmask_result{};
4080 for (UInt32 index = UInt32{0}; index < UInt32(kRegistersInvolved); index += UInt32{1}) {
4081 const auto [raw_result] =
4082 intrinsics::SimdMaskToBitMask<ElementType>(std::get<0>(intrinsic(index)));
4083 if constexpr (sizeof(ElementType) < sizeof(Int32)) {
4084 bitmask_result.Set(raw_result, index);
4085 } else {
4086 constexpr UInt32 kElemNum =
4087 UInt32{static_cast<uint32_t>((sizeof(SIMD128Register) / sizeof(ElementType)))};
4088 bitmask_result |= UInt32(UInt8(raw_result)) << (index * kElemNum);
4089 }
4090 }
4091 return SIMD128Register(bitmask_result);
4092 }
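
  // A minimal illustrative sketch of the two tactics above, using hypothetical element widths: a
  // 128-bit register holds 16 eight-bit elements (a full UInt8/UInt16 mask, tactic 1) but only 2
  // sixty-four-bit elements, so tactic 2 accumulates 2-bit chunks shifted by index * 2.
  static_assert(16 / sizeof(uint8_t) == 16 && 16 / sizeof(uint64_t) == 2);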
4093
4094 void Nop() {}
4095
4096 void Undefined() {
4097 UndefinedInsn(GetInsnAddr());
4098 // If there is a guest handler registered for SIGILL, we delay its processing until the next
4099 // sync point (likely the main dispatching loop) because pending signals are enabled. Thus we
4100 // must ensure that insn_addr isn't automatically advanced in FinalizeInsn.
4101 exception_raised_ = true;
4102 }
4103
4104 //
4105 // Guest state getters/setters.
4106 //
4107
4108 Register GetReg(uint8_t reg) const {
4109 CheckRegIsValid(reg);
4110 return state_->cpu.x[reg];
4111 }
4112
4113 Register GetRegOrZero(uint8_t reg) { return reg == 0 ? 0 : GetReg(reg); }
4114
4115 void SetReg(uint8_t reg, Register value) {
4116 if (exception_raised_) {
4117 // Do not produce side effects.
4118 return;
4119 }
4120 CheckRegIsValid(reg);
4121 state_->cpu.x[reg] = value;
4122 }
4123
4124 void SetRegOrIgnore(uint8_t reg, Register value) {
4125 if (reg != 0) {
4126 SetReg(reg, value);
4127 }
4128 }
4129
4130 FpRegister GetFpReg(uint8_t reg) const {
4131 CheckFpRegIsValid(reg);
4132 return state_->cpu.f[reg];
4133 }
4134
4135 template <typename FloatType>
4136 FpRegister GetFRegAndUnboxNan(uint8_t reg);
4137
4138 template <typename FloatType>
4139 void NanBoxAndSetFpReg(uint8_t reg, FpRegister value);
4140
4141 //
4142 // Various helper methods.
4143 //
4144
4145 template <CsrName kName>
4146 [[nodiscard]] Register GetCsr() const {
4147 return state_->cpu.*CsrFieldAddr<kName>;
4148 }
4149
4150 template <CsrName kName>
4151 void SetCsr(Register arg) {
4152 if (exception_raised_) {
4153 return;
4154 }
4155 state_->cpu.*CsrFieldAddr<kName> = arg & kCsrMask<kName>;
4156 }
4157
4158 [[nodiscard]] uint64_t GetImm(uint64_t imm) const { return imm; }
4159
4160 [[nodiscard]] Register Copy(Register value) const { return value; }
4161
4162 [[nodiscard]] GuestAddr GetInsnAddr() const { return state_->cpu.insn_addr; }
4163
4164 void FinalizeInsn(uint8_t insn_len) {
4165 if (!branch_taken_ && !exception_raised_) {
4166 state_->cpu.insn_addr += insn_len;
4167 }
4168 }
4169
4170 #include "berberis/intrinsics/interpreter_intrinsics_hooks-inl.h"
4171
4172 private:
4173 template <typename DataType>
4174 Register Load(const void* ptr) {
4175 static_assert(std::is_integral_v<DataType>);
4176 CHECK(!exception_raised_);
4177 FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
4178 if (result.is_fault) {
4179 exception_raised_ = true;
4180 return {};
4181 }
4182 return static_cast<DataType>(result.value);
4183 }
4184
4185 template <typename DataType>
4186 void Store(void* ptr, uint64_t data) {
4187 static_assert(std::is_integral_v<DataType>);
4188 CHECK(!exception_raised_);
4189 exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
4190 }
4191
4192 void CheckShamtIsValid(int8_t shamt) const {
4193 CHECK_GE(shamt, 0);
4194 CHECK_LT(shamt, 64);
4195 }
4196
4197 void CheckShamt32IsValid(int8_t shamt) const {
4198 CHECK_GE(shamt, 0);
4199 CHECK_LT(shamt, 32);
4200 }
4201
4202 void CheckRegIsValid(uint8_t reg) const {
4203 CHECK_GT(reg, 0u);
4204 CHECK_LE(reg, std::size(state_->cpu.x));
4205 }
4206
4207 void CheckFpRegIsValid(uint8_t reg) const { CHECK_LT(reg, std::size(state_->cpu.f)); }
4208
4209 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4210 SIMD128Register GetHighVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4211 size_t /*vstart*/,
4212 size_t /*vl*/,
4213 size_t index,
4214 MaskType /*mask*/) {
4215 return std::get<0>(intrinsics::VMovTopHalfToBottom<ElementType>(
4216 SIMD128Register{state_->cpu.v[src.start_no + index]}));
4217 }
4218
4219 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4220 SIMD128Register GetHighVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4221 size_t /*vstart*/,
4222 size_t /*vl*/,
4223 size_t index,
4224 MaskType /*mask*/) {
4225 return SIMD128Register{state_->cpu.v[src.start_no + 2 * index + 1]};
4226 }
4227
4228 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4229 ElementType GetHighVectorArgument(ElementType arg,
4230 size_t /*vstart*/,
4231 size_t /*vl*/,
4232 size_t /*index*/,
4233 MaskType /*mask*/) {
4234 return arg;
4235 }
4236
4237 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4238 SIMD128Register GetLowVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4239 size_t /*vstart*/,
4240 size_t /*vl*/,
4241 size_t index,
4242 MaskType /*mask*/) {
4243 return SIMD128Register{state_->cpu.v[src.start_no + index]};
4244 }
4245
4246 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4247 SIMD128Register GetLowVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4248 size_t /*vstart*/,
4249 size_t /*vl*/,
4250 size_t index,
4251 MaskType /*mask*/) {
4252 return SIMD128Register{state_->cpu.v[src.start_no + 2 * index]};
4253 }
4254
4255 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4256 ElementType GetLowVectorArgument(ElementType arg,
4257 size_t /*vstart*/,
4258 size_t /*vl*/,
4259 size_t /*index*/,
4260 MaskType /*mask*/) {
4261 return arg;
4262 }
4263
4264 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4265 SIMD128Register GetVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4266 size_t /*vstart*/,
4267 size_t /*vl*/,
4268 size_t index,
4269 MaskType /*mask*/) {
4270 return SIMD128Register{state_->cpu.v[src.start_no + index]};
4271 }
4272
4273 template <typename ElementType,
4274 TailProcessing vta,
4275 auto vma,
4276 typename MaskType,
4277 auto kDefaultElement>
4278 SIMD128Register GetVectorArgument(Vec<kDefaultElement> src,
4279 size_t vstart,
4280 size_t vl,
4281 size_t index,
4282 MaskType mask) {
4283 return VectorMasking<kDefaultElement, vta, vma>(
4284 SIMD128Register{state_->cpu.v[src.start_no + index]}, vstart, vl, index, mask);
4285 }
4286
4287 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4288 ElementType GetVectorArgument(ElementType arg,
4289 size_t /*vstart*/,
4290 size_t /*vl*/,
4291 size_t /*index*/,
4292 MaskType /*mask*/) {
4293 return arg;
4294 }
4295
4296 template <bool kUseMasking>
4297 std::conditional_t<kUseMasking, SIMD128Register, intrinsics::NoInactiveProcessing>
4298 GetMaskForVectorOperationsIfNeeded() {
4299 if constexpr (kUseMasking) {
4300 return {state_->cpu.v[0]};
4301 } else {
4302 return intrinsics::NoInactiveProcessing{};
4303 }
4304 }
4305
4306 template <auto vma>
4307 std::conditional_t<std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
4308 intrinsics::NoInactiveProcessing,
4309 SIMD128Register>
4310 GetMaskForVectorOperations() {
4311 return GetMaskForVectorOperationsIfNeeded<
4312 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>();
4313 }
4314
4315 template <auto kDefaultElement, TailProcessing vta, auto vma, typename MaskType>
4316 SIMD128Register VectorMasking(SIMD128Register result,
4317 size_t vstart,
4318 size_t vl,
4319 size_t index,
4320 MaskType mask) {
4321 return std::get<0>(intrinsics::VectorMasking<kDefaultElement, vta, vma>(
4322 result,
4323 vstart - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4324 vl - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4325 std::get<0>(
4326 intrinsics::MaskForRegisterInSequence<decltype(kDefaultElement)>(mask, index))));
4327 }
4328
4329 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4330 SIMD128Register VectorMasking(SIMD128Register dest,
4331 SIMD128Register result,
4332 size_t vstart,
4333 size_t vl,
4334 size_t index,
4335 MaskType mask) {
4336 return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
4337 dest,
4338 result,
4339 vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4340 vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4341 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
4342 }
4343
4344 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
4345 SIMD128Register VectorMasking(SIMD128Register dest,
4346 SIMD128Register result,
4347 SIMD128Register result_mask,
4348 size_t vstart,
4349 size_t vl,
4350 size_t index,
4351 MaskType mask) {
4352 return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
4353 dest,
4354 result,
4355 result_mask,
4356 vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4357 vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4358 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
4359 }
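
  // A minimal illustrative sketch of the per-register vstart/vl adjustment above, with
  // hypothetical numbers: assuming 32-bit elements (4 per register), vl == 10 and register
  // index == 2, the value passed down to the intrinsic is 10 - 2 * 4 == 2, i.e. only the first
  // two elements of that register are body elements.
  static_assert(10 - 2 * (16 / sizeof(uint32_t)) == 2);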
4360
4361 template <template <auto> typename ProcessType,
4362 auto kLambda =
4363 [](auto packaged_value) {
4364 auto [unpacked_value] = packaged_value;
4365 return unpacked_value;
4366 },
4367 auto kDefaultValue = false,
4368 typename... Args>
4369 [[nodiscard]] static constexpr auto OrValuesOnlyForType(Args... args) {
4370 return OrResultsOnlyForType<ProcessType, kDefaultValue>(kLambda, args...);
4371 }
4372
4373 template <template <auto> typename ProcessTemplateType,
4374 auto kDefaultValue = false,
4375 typename Lambda,
4376 typename... Args>
4377 [[nodiscard]] static constexpr auto OrResultsOnlyForType(Lambda lambda, Args... args) {
4378 #pragma GCC diagnostic push
4379 #pragma GCC diagnostic ignored "-Wbitwise-instead-of-logical"
4380 return ([lambda](auto arg) {
4381 if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
4382 return lambda(arg);
4383 } else {
4384 return kDefaultValue;
4385 }
4386 }(args) |
4387 ...);
4388 #pragma GCC diagnostic pop
4389 }
4390
4391 template <template <auto> typename ProcessTemplateType, typename Lambda, typename... Args>
4392 static constexpr void ProcessOnlyForType(Lambda lambda, Args... args) {
4393 (
4394 [lambda](auto arg) {
4395 if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
4396 lambda(arg);
4397 }
4398 }(args),
4399 ...);
4400 }
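
  // A minimal illustrative sketch of the dispatch used by the helpers above (assuming Vec is a
  // single-value template as used elsewhere in this file): IsTypeTemplateOf selects only arguments
  // whose type is an instance of ProcessTemplateType, and the fold expressions skip everything
  // else.
  static_assert(IsTypeTemplateOf<Vec<intrinsics::NoInactiveProcessing{}>, Vec> &&
                !IsTypeTemplateOf<int, Vec>);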
4401
4402 ThreadState* state_;
4403 bool branch_taken_;
4404 // This flag is set by illegal instructions and faulted memory accesses. The former always
4405 // stops the playback of the current instruction, so we don't need to do anything special. The
4406 // latter may result in more operations with side effects being invoked before the end of the
4407 // current instruction:
4408 //   Load (faulted) -> SetReg
4409 //   LoadFp (faulted) -> NanBoxAndSetFpReg
4410 // If an exception is raised before these operations, we skip them. For all other operations with
4411 // side effects we check that this flag is never raised.
4412 bool exception_raised_;
4413 };
4414
4415 template <>
4416 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kCycle>() const {
4417 return CPUClockCount();
4418 }
4419
4420 template <>
4421 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFCsr>() const {
4422 return FeGetExceptions() | (state_->cpu.frm << 5);
4423 }
4424
4425 template <>
4426 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFFlags>() const {
4427 return FeGetExceptions();
4428 }
4429
4430 template <>
4431 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVlenb>() const {
4432 return 16;
4433 }
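
// Note: vlenb reports the vector register length in bytes; returning 16 matches the 128-bit
// SIMD128Register that backs each guest vector register (a minimal sanity sketch of that relation).
static_assert(sizeof(SIMD128Register) == 16);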
4434
4435 template <>
4436 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxrm>() const {
4437 return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11;
4438 }
4439
4440 template <>
4441 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxsat>() const {
4442 return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> >> 2;
4443 }
4444
4445 template <>
4446 void inline Interpreter::SetCsr<CsrName::kFCsr>(Register arg) {
4447 CHECK(!exception_raised_);
4448 FeSetExceptions(arg & 0b1'1111);
4449 arg = (arg >> 5) & kCsrMask<CsrName::kFrm>;
4450 state_->cpu.frm = arg;
4451 FeSetRound(arg);
4452 }
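
// A minimal illustrative sketch of the fcsr split handled above, with a hypothetical value:
// frm == 0b001 combined with all five exception flags reads back as (0b001 << 5) | 0b1'1111 == 0x3f.
static_assert(((0b001 << 5) | 0b1'1111) == 0x3f);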
4453
4454 template <>
4455 void inline Interpreter::SetCsr<CsrName::kFFlags>(Register arg) {
4456 CHECK(!exception_raised_);
4457 FeSetExceptions(arg & 0b1'1111);
4458 }
4459
4460 template <>
4461 void inline Interpreter::SetCsr<CsrName::kFrm>(Register arg) {
4462 CHECK(!exception_raised_);
4463 arg &= kCsrMask<CsrName::kFrm>;
4464 state_->cpu.frm = arg;
4465 FeSetRound(arg);
4466 }
4467
4468 template <>
4469 void inline Interpreter::SetCsr<CsrName::kVxrm>(Register arg) {
4470 CHECK(!exception_raised_);
4471 state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
4472 (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b100) | (arg & 0b11);
4473 }
4474
4475 template <>
4476 void inline Interpreter::SetCsr<CsrName::kVxsat>(Register arg) {
4477 CHECK(!exception_raised_);
4478 state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
4479 (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11) | ((arg & 0b1) << 2);
4480 }
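
// A minimal illustrative sketch of the vcsr packing handled above, with a hypothetical value:
// vcsr == 0b101 decodes to vxrm == 0b01 (low two bits) and vxsat == 1 (bit 2), matching the masks
// and shifts in the accessors.
static_assert((0b101 & 0b11) == 0b01 && (0b101 >> 2) == 1);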
4481
4482 template <>
4483 [[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>(
4484 uint8_t reg) {
4485 CheckFpRegIsValid(reg);
4486 FpRegister value = state_->cpu.f[reg];
4487 return UnboxNan<Float32>(value);
4488 }
4489
4490 template <>
4491 [[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>(
4492 uint8_t reg) {
4493 CheckFpRegIsValid(reg);
4494 return state_->cpu.f[reg];
4495 }
4496
4497 template <>
4498 void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) {
4499 if (exception_raised_) {
4500 // Do not produce side effects.
4501 return;
4502 }
4503 CheckFpRegIsValid(reg);
4504 state_->cpu.f[reg] = NanBox<Float32>(value);
4505 }
4506
4507 template <>
4508 void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) {
4509 if (exception_raised_) {
4510 // Do not produce side effects.
4511 return;
4512 }
4513 CheckFpRegIsValid(reg);
4514 state_->cpu.f[reg] = value;
4515 }
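
// A minimal illustrative sketch of the NaN boxing used above, with a hypothetical bit pattern: a
// NaN-boxed Float32 keeps its 32 value bits in the low half of the 64-bit register and fills the
// upper half with ones, so boxing 1.0f (0x3f80'0000) yields 0xffff'ffff'3f80'0000.
static_assert(((~uint64_t{0} << 32) | 0x3f80'0000) == 0xffff'ffff'3f80'0000);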
4516
4517 #ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS
4518 template <>
4519 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadIndexedArgs& args);
4520 template <>
4521 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadStrideArgs& args);
4522 template <>
4523 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadUnitStrideArgs& args);
4524 template <>
4525 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVfArgs& args);
4526 template <>
4527 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVvArgs& args);
4528 template <>
4529 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIViArgs& args);
4530 template <>
4531 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVvArgs& args);
4532 template <>
4533 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVxArgs& args);
4534 template <>
4535 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVvArgs& args);
4536 template <>
4537 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVxArgs& args);
4538 template <>
4539 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreIndexedArgs& args);
4540 template <>
4541 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreStrideArgs& args);
4542 template <>
4543 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreUnitStrideArgs& args);
4544 #endif
4545
4546 } // namespace berberis
4547