/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef BERBERIS_INTRINSICS_RISCV64_VECTOR_INTRINSICS_H_
#define BERBERIS_INTRINSICS_RISCV64_VECTOR_INTRINSICS_H_

#include <algorithm>
#include <climits>  // CHAR_BIT
#include <cstdint>
#include <limits>
#include <tuple>
#include <type_traits>

#include "berberis/base/bit_util.h"
#include "berberis/base/dependent_false.h"
#include "berberis/intrinsics/intrinsics.h"        // PreferredIntrinsicsImplementation
#include "berberis/intrinsics/intrinsics_float.h"  // Float32/Float64
#include "berberis/intrinsics/simd_register.h"
#include "berberis/intrinsics/type_traits.h"

namespace berberis::intrinsics {

enum class TailProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};

enum class InactiveProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};

enum class NoInactiveProcessing {
  kNoInactiveProcessing = 0,
};
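
// These two policies mirror the vta/vma bits of the RISC-V V extension: "undisturbed" keeps
// the old destination value in tail/inactive element positions, while "agnostic" allows those
// positions to be either left alone or filled with all ones.  NoInactiveProcessing marks
// unmasked operations, which have no inactive elements at all.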

template <typename ElementType>
[[nodiscard]] inline std::tuple<NoInactiveProcessing> FullMaskForRegister(NoInactiveProcessing) {
  return {NoInactiveProcessing{}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
FullMaskForRegister(SIMD128Register) {
  if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
    return {{0xffff}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
    return {{0xff}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
    return {{0xf}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
    return {{0x3}};
  } else {
    static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
  }
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<NoInactiveProcessing> MaskForRegisterInSequence(
    NoInactiveProcessing,
    size_t) {
  return {NoInactiveProcessing{}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
MaskForRegisterInSequence(SIMD128Register mask, size_t register_in_sequence) {
  if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
    return {mask.Get<RawInt16>(register_in_sequence)};
  } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
    return {mask.Get<RawInt8>(register_in_sequence)};
  } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
    return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 4)) &
                    UInt8{0b1111}}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
    return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 2)) &
                    UInt8{0b11}}};
  } else {
    static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
  }
}
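
// Illustration (values assumed): with ElementType = UInt32 each SIMD128Register holds four
// elements, so the mask is consumed four bits at a time; register_in_sequence == 2 selects
// bits [8, 12) of the mask register.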

// Naïve implementation for tests.  Also used on non-x86 platforms.
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVlForTests(size_t vl) {
  if (vl == 128) {
    return {SIMD128Register(__int128(0))};
  } else {
    return {SIMD128Register((~__int128(0)) << vl)};
  }
}
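
// For example, MakeBitmaskFromVlForTests(100) sets bits [100, 128): the returned bitmask marks
// exactly the tail positions at and above vl.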

#ifndef __x86_64__
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
  return {MakeBitmaskFromVlForTests(vl)};
}
#endif

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
  return MakeBitmaskFromVl(vl * sizeof(ElementType) * CHAR_BIT);
}

// Naïve implementation for tests.  Also used on non-x86 platforms.
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMaskForTests(size_t mask) {
  constexpr ElementType kZeroValue = ElementType{0};
  constexpr ElementType kFillValue = ~ElementType{0};
  SIMD128Register result;
  for (size_t index = 0; index < sizeof(SIMD128Register) / sizeof(ElementType); ++index) {
    size_t bit = 1 << index;
    if (mask & bit) {
      result.Set(kFillValue, index);
    } else {
      result.Set(kZeroValue, index);
    }
  }
  return {result};
}
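
// For example, with ElementType = UInt8 and mask = 0b101 elements 0 and 2 of the result become
// 0xff and all the others 0x00: bit i of the source mask is expanded into element i.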

#ifndef __x86_64__
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMask(size_t mask) {
  return {BitMaskToSimdMaskForTests<ElementType>(mask)};
}
#endif

// Naïve implementation for tests.  Also used on non-x86 platforms.
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMaskForTests(SIMD128Register simd_mask) {
  using ResultType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
  ResultType mask{0};
  constexpr ResultType kElementsCount{
      static_cast<uint8_t>(sizeof(SIMD128Register) / sizeof(ElementType))};
  for (ResultType index{0}; index < kElementsCount; index += ResultType{1}) {
    if (simd_mask.Get<ElementType>(index) != ElementType{}) {
      mask |= ResultType{1} << ResultType{index};
    }
  }
  return mask;
}

#ifndef __SSSE3__
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMask(SIMD128Register simd_mask) {
  return SimdMaskToBitMaskForTests<ElementType>(simd_mask);
}
#endif

template <auto kElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementToForTests(
    SIMD128Register simd_mask,
    SIMD128Register result) {
  using ElementType = decltype(kElement);
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    if (!simd_mask.Get<ElementType>(index)) {
      result.Set(kElement, index);
    }
  }
  return result;
}

#ifndef __x86_64__
template <auto kElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementTo(SIMD128Register simd_mask,
                                                                       SIMD128Register result) {
  return VectorMaskedElementToForTests<kElement>(simd_mask, result);
}
#endif

template <typename ElementType>
[[nodiscard]] inline ElementType VectorElement(SIMD128Register src, int index) {
  return src.Get<ElementType>(index);
}

template <typename ElementType>
[[nodiscard]] inline ElementType VectorElement(ElementType src, int) {
  return src;
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VMovTopHalfToBottom(SIMD128Register src) {
  return {SIMD128Register{src.Get<uint64_t>(1)}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VMergeBottomHalfToTop(SIMD128Register bottom,
                                                                       SIMD128Register top) {
  SIMD128Register result{bottom};
  result.Set<uint64_t>(top.Get<uint64_t>(0), 1);
  return result;
}

// Naïve implementation for tests.  Also used on non-x86 platforms.
template <auto kDefaultElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcastForTests() {
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof kDefaultElement;
  SIMD128Register dest;
  for (size_t index = 0; index < kElementsCount; ++index) {
    dest.Set(kDefaultElement, index);
  }
  return dest;
}

#ifndef __x86_64__
template <auto kDefaultElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcast() {
  return VectorBroadcastForTests<kDefaultElement>();
}
#endif
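
// Applies tail masking to a fully computed result: elements in [vstart, vl) keep the computed
// value while everything outside that range is forced to kDefaultElement.  The three
// if-constexpr branches below specialize the common all-zeroes and all-ones defaults, where
// plain bitmask operations suffice, and fall back to a broadcast merge otherwise.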

template <auto kDefaultElement, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof kDefaultElement);
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if constexpr (kDefaultElement == decltype(kDefaultElement){}) {
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl.  But when both vstart and vl exceed kElementsCount, vl is clamped to
      // kElementsCount above and we land in this corner case; return zero then.
      result = SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
    }
  } else if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result |= tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl.  But when both vstart and vl exceed kElementsCount, vl is clamped to
      // kElementsCount above and we land in this corner case; return all-ones then.
      result = ~SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result |= ~start_bitmask;
      result |= tail_bitmask;
    }
  } else {
    const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
        result |= (std::get<0>(dest) & tail_bitmask);
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl.  But when both vstart and vl exceed kElementsCount, vl is clamped to
      // kElementsCount above and we land in this corner case; return dest then.
      result = std::get<0>(dest);
    } else {
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
      result |= (std::get<0>(dest) & (~start_bitmask | tail_bitmask));
    }
  }
  return result;
}

template <auto kDefaultElement,
          TailProcessing vta,
          auto vma = NoInactiveProcessing{},
          typename MaskType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl,
                                                               MaskType mask) {
  static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
                 std::is_same_v<MaskType, NoInactiveProcessing>) ||
                (std::is_same_v<decltype(vma), InactiveProcessing> &&
                 (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
  if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
    const auto [simd_mask] = BitMaskToSimdMask<decltype(kDefaultElement)>(
        static_cast<typename MaskType::BaseType>(mask));
    if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
      result |= ~simd_mask;
    } else {
      result &= simd_mask;
      if constexpr (kDefaultElement != decltype(kDefaultElement){}) {
        const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
        result |= std::get<0>(dest) & ~simd_mask;
      }
    }
  }
  return VectorMasking<kDefaultElement, vta>(result, vstart, vl);
}

template <typename ElementType, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
    SIMD128Register dest,
    SIMD128Register result,
    int vstart,
    int vl,
    NoInactiveProcessing /*mask*/ = NoInactiveProcessing{}) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType));
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if (vstart == 0) [[likely]] {
    if (vl == kElementsCount) [[likely]] {
      return result;
    }
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      dest = result | tail_bitmask;
    } else {
      dest = (dest & tail_bitmask) | (result & ~tail_bitmask);
    }
  } else if (vstart < vl) [[likely]] {
    // Note: normally vstart < vl because RISC-V instructions don't alter the result if
    // vstart >= vl.  When both vstart and vl exceed kElementsCount, vl is clamped to
    // kElementsCount above, this branch is skipped, and dest is returned unchanged.
    const auto [start_bitmask] = MakeBitmaskFromVl<ElementType>(vstart);
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      dest = (dest & ~start_bitmask) | (result & start_bitmask) | tail_bitmask;
    } else {
      dest = (dest & (~start_bitmask | tail_bitmask)) | (result & start_bitmask & ~tail_bitmask);
    }
  } else if constexpr (vta == TailProcessing::kAgnostic) {
    if (vstart == vl) {
      // Corner case where vstart == vl may happen because of vslideup:
      //   https://github.com/riscv/riscv-v-spec/issues/263
      const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
      dest |= tail_bitmask;
    }
  }
  return {dest};
}
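
// Usage sketch (values assumed): with ElementType = UInt8, vstart = 2, and vl = 14 the merge
// above keeps dest[0..2), takes result[2..14), and handles dest[14..16) according to vta.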

template <typename ElementType,
          TailProcessing vta,
          auto vma = NoInactiveProcessing{},
          typename MaskType = NoInactiveProcessing>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
    SIMD128Register dest,
    SIMD128Register result,
    SIMD128Register result_mask,
    int vstart,
    int vl,
    MaskType mask = NoInactiveProcessing{}) {
  static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
                 std::is_same_v<MaskType, NoInactiveProcessing>) ||
                (std::is_same_v<decltype(vma), InactiveProcessing> &&
                 (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
  if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
    const auto [simd_mask] =
        BitMaskToSimdMask<ElementType>(static_cast<typename MaskType::BaseType>(mask));
    if (vma == InactiveProcessing::kAgnostic) {
      result |= ~simd_mask;
    } else {
      result = (result & simd_mask) | (result_mask & ~simd_mask);
    }
  }
  return VectorMasking<ElementType, vta>(dest, result, vstart, vl);
}

template <typename ElementType, TailProcessing vta, InactiveProcessing vma, typename MaskType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register dest,
                                                               SIMD128Register result,
                                                               int vstart,
                                                               int vl,
                                                               MaskType mask) {
  return VectorMasking<ElementType, vta, vma>(dest,
                                              result,
                                              /*result_mask=*/dest,
                                              vstart,
                                              vl,
                                              mask);
}

template <typename ElementType, typename... ParameterType>
inline constexpr bool kIsAllowedArgumentForVector =
    ((std::is_same_v<ParameterType, SIMD128Register> ||
      std::is_same_v<ParameterType, ElementType>) && ...);

// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename... ParameterType>
inline std::tuple<SIMD128Register> VectorProcessing(Lambda lambda, ParameterType... parameters) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(VectorElement<ElementType>(parameters, index)...), index);
  }
  return result;
}
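
// Hypothetical call showing the shape of the API: VectorProcessing<UInt32>(
//     [](auto x, auto y) { return x + y; }, src1, src2)
// applies the lambda to each of the four 32-bit lanes; a scalar argument in place of a
// SIMD128Register is broadcast to every lane via the second VectorElement overload.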

// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename ResultType, typename... ParameterType>
inline std::tuple<ResultType> VectorProcessingReduce(Lambda lambda,
                                                     ResultType init,
                                                     ParameterType... parameters) {
  static_assert(std::is_same_v<ResultType, ElementType> ||
                std::is_same_v<ResultType, WideType<ElementType>>);
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    init = lambda(init, VectorElement<ElementType>(parameters, index)...);
  }
  return init;
}

// SEW = 2*SEW op SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
inline std::tuple<SIMD128Register> VectorArithmeticNarrowwv(Lambda lambda,
                                                            ParameterType1 src1,
                                                            ParameterType2 src2) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(Narrow(lambda(VectorElement<WideType<ElementType>>(src1, index),
                             Widen(VectorElement<ElementType>(src2, index)))),
               index);
  }
  return result;
}

// 2*SEW = SEW op SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename... ParameterType>
inline std::tuple<SIMD128Register> VectorArithmeticWidenvv(Lambda lambda,
                                                           ParameterType... parameters) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(Widen(VectorElement<ElementType>(parameters, index))...), index);
  }
  return result;
}

// 2*SEW = SEW op SEW op 2*SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType,
          typename Lambda,
          typename ParameterType1,
          typename ParameterType2,
          typename ParameterType3>
inline std::tuple<SIMD128Register> VectorArithmeticWidenvvw(Lambda lambda,
                                                            ParameterType1 src1,
                                                            ParameterType2 src2,
                                                            ParameterType3 src3) {
  static_assert(
      kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2, ParameterType3>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(Widen(VectorElement<ElementType>(src1, index)),
                      Widen(VectorElement<ElementType>(src2, index)),
                      VectorElement<WideType<ElementType>>(src3, index)),
               index);
  }
  return result;
}

// 2*SEW = 2*SEW op SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
inline std::tuple<SIMD128Register> VectorArithmeticWidenwv(Lambda lambda,
                                                           ParameterType1 src1,
                                                           ParameterType2 src2) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(VectorElement<WideType<ElementType>>(src1, index),
                      Widen(VectorElement<ElementType>(src2, index))),
               index);
  }
  return result;
}

template <typename ElementType>
SIMD128Register VectorExtend(SIMD128Register src) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(Widen(VectorElement<ElementType>(src, index)), index);
  }
  return result;
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf2(SIMD128Register src) {
  using SourceElementType = NarrowType<ElementType>;
  return {VectorExtend<SourceElementType>(src)};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf4(SIMD128Register src) {
  using WideSourceElementType = NarrowType<ElementType>;
  using SourceElementType = NarrowType<WideSourceElementType>;
  return {VectorExtend<WideSourceElementType>(VectorExtend<SourceElementType>(src))};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf8(SIMD128Register src) {
  using WideWideSourceElementType = NarrowType<ElementType>;
  return {
      VectorExtend<WideWideSourceElementType>(std::get<0>(Vextf4<WideWideSourceElementType>(src)))};
}
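
// Assumed semantics matching the RVV vzext/vsext instructions: Vextf2<Int16> sign-extends the
// eight Int8 elements from the bottom half of src into eight Int16 results, while Vextf4 and
// Vextf8 chain VectorExtend for 4x and 8x widening (an unsigned ElementType zero-extends).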

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> VidvForTests(size_t index) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  ElementType element = {static_cast<typename ElementType::BaseType>(index * kElementsCount)};
  for (size_t element_index = 0; element_index < kElementsCount; ++element_index) {
    result.Set(element, element_index);
    element += ElementType{1};
  }
  return result;
}

// Handles "slide up" for a single destination register. Effectively copies the last offset
// elements in [kElementsCount - offset, kElementsCount) of src1 followed by the first
// [0, kElementsCount - offset) elements of src2 into the result.
//
// This leaves result looking like
//
//     result = {
//         src1[kElementsCount-offset+0],
//         src1[kElementsCount-offset+1],
//         ...,
//         src1[kElementsCount-offset+(offset-1)],
//         src2[0],
//         src2[1],
//         ...,
//         src2[kElementsCount-offset-1]
//     };
template <typename ElementType>
inline std::tuple<SIMD128Register> VectorSlideUp(size_t offset,
                                                 SIMD128Register src1,
                                                 SIMD128Register src2) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  CHECK_LT(offset, kElementsCount);
  for (size_t index = 0; index < offset; ++index) {
    result.Set(VectorElement<ElementType>(src1, kElementsCount - offset + index), index);
  }
  for (size_t index = offset; index < kElementsCount; ++index) {
    result.Set(VectorElement<ElementType>(src2, index - offset), index);
  }
  return result;
}
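
// Concrete example (values assumed): with ElementType = UInt32 (kElementsCount = 4) and
// offset = 1 the result is {src1[3], src2[0], src2[1], src2[2]}.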

// Handles "slide down" for a single destination register. Effectively copies the elements in
// [offset, kElementsCount) of src1 followed by the first [0, offset) elements of src2 into
// the result.
//
// This leaves result looking like
//
//     result = {
//         [0] = src1[offset+0],
//         [1] = src1[offset+1],
//         ...,
//         [kElementsCount-offset-1] = src1[kElementsCount-1],
//         [kElementsCount-offset] = src2[0],
//         [kElementsCount-offset+1] = src2[1],
//         ...,
//         [kElementsCount-1] = src2[offset-1]
//     };
template <typename ElementType>
inline std::tuple<SIMD128Register> VectorSlideDown(size_t offset,
                                                   SIMD128Register src1,
                                                   SIMD128Register src2) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  CHECK_LT(offset, kElementsCount);
  for (size_t index = 0; index < kElementsCount - offset; ++index) {
    result.Set(VectorElement<ElementType>(src1, offset + index), index);
  }
  for (size_t index = kElementsCount - offset; index < kElementsCount; ++index) {
    result.Set(VectorElement<ElementType>(src2, index - (kElementsCount - offset)), index);
  }
  return result;
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vcpopm(SIMD128Register simd_src) {
  UInt128 src = simd_src.Get<UInt128>();
  return Popcount(src);
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vfirstm(SIMD128Register simd_src) {
  UInt128 src = simd_src.Get<UInt128>();
  if (src == UInt128{0}) {
    return ~UInt128{0};
  }
  return CountRZero(src);
}
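
// Vfirstm mirrors vfirst.m: it yields the index of the lowest set mask bit, or all-ones
// (i.e. -1) when the mask is empty.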

#ifndef __x86_64__
template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vidv(size_t index) {
  return VidvForTests<ElementType>(index);
}
#endif

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsifm(SIMD128Register simd_src) {
  Int128 src = simd_src.Get<Int128>();
  return {(src - Int128{1}) ^ src};
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsbfm(SIMD128Register simd_src) {
  Int128 src = simd_src.Get<Int128>();
  if (src == Int128{0}) {
    return {~Int128{0}};
  }
  return {std::get<0>(Vmsifm(simd_src)).Get<Int128>() >> Int128{1}};
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsofm(SIMD128Register simd_src) {
  return {std::get<0>(Vmsbfm(simd_src)) ^ std::get<0>(Vmsifm(simd_src))};
}
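
// Bit-trick summary: if b is the lowest set bit of src, then (src - 1) ^ src sets bits [0, b]
// (set-including-first, vmsif.m), shifting that right by one keeps bits [0, b)
// (set-before-first, vmsbf.m), and xoring the two isolates bit b (set-only-first, vmsof.m).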

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register, size_t> Viotam(SIMD128Register simd_src, size_t counter) {
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  __uint128_t src = simd_src.Get<__uint128_t>();
  SIMD128Register result;
  for (size_t index = 0; index < kElementsCount; ++index) {
    typename Wrapping<typename ElementType::BaseType>::UnsignedType value{
        static_cast<typename ElementType::BaseType>(counter)};
    result.Set(value, index);
    counter += static_cast<size_t>(src & 1);
    src >>= 1;
  }
  return {result, counter};
}
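
// Example (assumed four-element register, counter = 0): mask bits 1,1,0,1 produce elements
// {0, 1, 2, 2} with a final counter of 3: each element receives the count of set mask bits
// strictly below its position, as viota.m requires.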

template <typename TargetElementType,
          typename SourceElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vfcvtv(int8_t rm, int8_t frm, SIMD128Register src) {
  SIMD128Register result;
  size_t kElementsCount = std::min(sizeof(SIMD128Register) / sizeof(TargetElementType),
                                   sizeof(SIMD128Register) / sizeof(SourceElementType));
  for (size_t index = 0; index < kElementsCount; ++index) {
    if constexpr (!std::is_same_v<TargetElementType, Float16> &&
                  !std::is_same_v<TargetElementType, Float32> &&
                  !std::is_same_v<TargetElementType, Float64>) {
      result.Set(
          std::get<0>(FCvtFloatToInteger<typename TargetElementType::BaseType, SourceElementType>(
              rm, frm, src.Get<SourceElementType>(index))),
          index);
    } else if constexpr (!std::is_same_v<SourceElementType, Float16> &&
                         !std::is_same_v<SourceElementType, Float32> &&
                         !std::is_same_v<SourceElementType, Float64>) {
      result.Set(
          std::get<0>(FCvtIntegerToFloat<TargetElementType, typename SourceElementType::BaseType>(
              rm, frm, src.Get<typename SourceElementType::BaseType>(index))),
          index);
    } else {
      result.Set(std::get<0>(FCvtFloatToFloat<TargetElementType, SourceElementType>(
                     rm, frm, src.Get<SourceElementType>(index))),
                 index);
    }
  }
  return result;
}

// With widening multiplication intrinsics we may do sign-extension or zero-extension, but some
// intrinsics need a mix: Signed * Unsigned.  We narrow the value down and then extend it again
// with the appropriate signedness; the compiler is smart enough to eliminate the dead code.
template <typename ElementType>
std::tuple<ElementType> WideMultiplySignedUnsigned(ElementType arg1, ElementType arg2) {
  return BitCastToUnsigned(Widen(BitCastToSigned(Narrow(arg1)))) *
         Widen(BitCastToUnsigned(Narrow(arg2)));
}
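
// Worked example with ElementType = UInt16: arg1 = 0xffff narrows to 0xff and sign-extends
// back to 0xffff (-1), while arg2 = 0x00ff narrows to 0xff and zero-extends to 0x00ff (255);
// the product 0xff01 is -255 in two's complement, a result that neither pure sign-extension
// ((-1) * (-1) = 1) nor pure zero-extension (255 * 255 = 0xfe01) would produce.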

#define DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS(...) __VA_ARGS__
#define DEFINE_ARITHMETIC_INTRINSIC(Name, arithmetic, parameters, capture, arguments)             \
  template <typename ElementType,                                                                 \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>       \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessing<ElementType>(                                                         \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) {                       \
          static_assert((std::is_same_v<decltype(args), ElementType> && ...));                    \
          arithmetic;                                                                             \
        },                                                                                        \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments);                                     \
  }
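
// For illustration, an assumed expansion: DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
// below produces roughly
//
//   template <typename ElementType, enum PreferredIntrinsicsImplementation = ...>
//   inline std::tuple<SIMD128Register> Vaddvv(SIMD128Register src1, SIMD128Register src2) {
//     return VectorProcessing<ElementType>(
//         [](auto... args) { return ({ (args + ...); }); }, src1, src2);
//   }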

#define DEFINE_1OP_ARITHMETIC_INTRINSIC_V(name, ...)                 \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src), (), (src))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(name, ...)                 \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(name, ...)                                             \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; });                             \
                              ,                                                                   \
                              (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
                              (),                                                                 \
                              (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...)                            \
  DEFINE_ARITHMETIC_INTRINSIC(                                                        \
      V##name##vv, return ({ __VA_ARGS__; });                                         \
      ,                                                                               \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr),                                                                          \
      (src1, src2, src3))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(name, ...)                 \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vx, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                        \
      V##name##vx, return ({ __VA_ARGS__; });         \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...)                        \
  DEFINE_ARITHMETIC_INTRINSIC(                                                    \
      V##name##vf, return ({ __VA_ARGS__; });                                     \
      ,                                                                           \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr),                                                                      \
      (src1, src2, src3))

#define DEFINE_1OP_ARITHMETIC_INTRINSIC_X(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##x, return ({ __VA_ARGS__; });, (ElementType src), (), (src))

#define DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(name, ...)            \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (int8_t csr, SIMD128Register src), (csr), (src))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                             \
      V##name##vf, return ({ __VA_ARGS__; });              \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                             \
      V##name##vv, return ({ __VA_ARGS__; });              \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                             \
      V##name##vx, return ({ __VA_ARGS__; });              \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Name, arithmetic, parameters, capture, arguments) \
  template <typename ElementType,                                                            \
            typename ResultType = ElementType,                                               \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>  \
  inline std::tuple<ResultType> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessingReduce<ElementType>(                                              \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) {                  \
          static_assert((std::is_same_v<decltype(args), ElementType> && ...));               \
          arithmetic;                                                                        \
        },                                                                                   \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments);                                \
  }

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(name, ...)                        \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(V##name##vs, return ({ __VA_ARGS__; }); \
                                     , (ResultType init, SIMD128Register src), (), (init, src))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(name, ...) \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(                      \
      Vfred##name##vs, return ({ __VA_ARGS__; });          \
      , (int8_t csr, ResultType init, SIMD128Register src), (csr), (init, src))

#define DEFINE_W_ARITHMETIC_INTRINSIC(Name, Pattern, arithmetic, parameters, capture, arguments)  \
  template <typename ElementType,                                                                 \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>       \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorArithmetic##Pattern<ElementType>(                                                \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) {                       \
          static_assert((std::is_same_v<decltype(args), WideType<ElementType>> && ...));          \
          arithmetic;                                                                             \
        },                                                                                        \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments);                                     \
  }

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vv, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##vv, Widenvv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##vf, Widenvv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##wv, Widenwv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##wf, Widenwv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...)              \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                          \
      Vw##name##vv, Widenvvw, return ({ __VA_ARGS__; });                  \
      ,                                                                   \
      (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (),                                                                 \
      (src1, src2, src3))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vx, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                             \
      Vw##name##vx, Widenvvw, return ({ __VA_ARGS__; });     \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...)                     \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                                      \
      Vfw##name##vv, Widenvvw, return ({ __VA_ARGS__; });                             \
      ,                                                                               \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr),                                                                          \
      (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...)                 \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                                  \
      Vfw##name##vf, Widenvvw, return ({ __VA_ARGS__; });                         \
      ,                                                                           \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr),                                                                      \
      (src1, src2, src3))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                  \
      Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; });          \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                  \
      Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; });          \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wv, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wx, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

DEFINE_1OP_ARITHMETIC_INTRINSIC_V(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_X(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(frsqrt7, RSqrtEstimate(args...))
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(
    fclass,
    static_cast<typename TypeTraits<ElementType>::Int>(std::get<0>(FClass(args...))))

DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(fsqrt,
                                       CanonicalizeNanTuple(FSqrt(FPFlags::DYN, csr, args...)))

DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redsum, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(rsub, auto [arg1, arg2] = std::tuple{args...}; (arg2 - arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redand, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redor, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redxor, (args ^ ...))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frsub, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, arg2, arg1)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(osum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(usum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frdiv, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, arg2, arg1)))
1053 // SIMD mask either includes results with all bits set to 0 or all bits set to 1.
1054 // This way it may be used with VAnd and VAndN operations to perform masking.
1055 // Such comparison is effectively one instruction of x86-64 (via SSE or AVX) but
1056 // to achieve it we need to multiply bool result by (~IntType{0}) or (~ElementType{0}).
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
// Note: for floating point numbers Flt(b, a) and !Fle(a, b) produce different and
// incompatible results. IEEE 754-2008 defines the NE (!=) predicate as the negation
// of the EQ (==) predicate, while GT (>) and GE (>=) are not negations of the
// LT (<) or LE (<=) predicates but instead swap the arguments. Note that the scalar
// form includes only three predicates (Feq, Flt, Fle), while the vector form
// provides the Vmfgt.vf and Vmfge.vf instructions only for the vector+scalar case
// (the vector+vector case is supposed to be handled by swapping arguments). More
// here: https://github.com/riscv/riscv-v-spec/issues/300
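// For example (an illustrative sketch with hypothetical values a = NaN, b = 1.0):
// every ordered comparison involving NaN is false, so
//   std::get<0>(Flt(b, a))    // false: "a > b" is correctly false for NaN input;
//   !std::get<0>(Fle(a, b))   // true: negating Fle would wrongly report "a > b".
// That is why fgt/fge below swap the arguments instead of negating Fle/Flt.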
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fgt, auto [arg1, arg2] = std::tuple{args...};
                                   using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(arg2, arg1))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fge, auto [arg1, arg2] = std::tuple{args...};
                                   using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(arg2, arg1))))
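// The integer comparisons below build the same all-zeros/all-ones masks:
// (~ElementType{0}) is multiplied by the 0-or-1 result of a fold-expression compare.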
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    seq,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    seq,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    sne,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    sne,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    slt,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    slt,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    sle,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    sle,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    sgt,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args > ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
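// In macc/nmsac the third operand (arg3) is the addend; madd/nmsub instead multiply
// by arg3 and use arg1 as the addend.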
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg3) + arg1))
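// The fused floating-point ops follow the same pattern: compare the argument order
// passed to FMAdd/FMSub/FNMAdd/FNMSub in fmacc/fmsac (arg2, arg1, arg3) with the
// one in fmadd/fmsub (arg3, arg2, arg1).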
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))

DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjx, std::get<0>(FSgnjx(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjx, std::get<0>(FSgnjx(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmin, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmax, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
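// mulh returns the upper half of the widened product; mulhsu widens arg1 as signed
// and arg2 as unsigned before multiplying.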
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))

DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
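// maccsu and maccus differ only in which operand WideMultiplySignedUnsigned treats
// as signed: note the swapped argument order, (arg2, arg1) versus (arg1, arg2).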
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                          arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                          arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccus, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg1, arg2))) +
                                          arg3)
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
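// Narrowing fixed-point clip: the wide source element is passed through Roundoff,
// which applies the rounding controlled by the csr value before the result is
// narrowed.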
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})

#undef DEFINE_ARITHMETIC_INTRINSIC
#undef DEFINE_W_ARITHMETIC_INTRINSIC
#undef DEFINE_ARITHMETIC_REDUCE_INTRINSIC
#undef DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_V
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_X
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW

}  // namespace berberis::intrinsics

#endif  // BERBERIS_INTRINSICS_RISCV64_VECTOR_INTRINSICS_H_