/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef BERBERIS_INTRINSICS_RISCV64_VECTOR_INTRINSICS_H_
#define BERBERIS_INTRINSICS_RISCV64_VECTOR_INTRINSICS_H_

#include <algorithm>
#include <climits>  // CHAR_BIT
#include <cstdint>
#include <limits>
#include <tuple>
#include <type_traits>

#include "berberis/base/bit_util.h"
#include "berberis/base/dependent_false.h"
#include "berberis/intrinsics/intrinsics.h"        // PreferredIntrinsicsImplementation
#include "berberis/intrinsics/intrinsics_float.h"  // Float32/Float64
#include "berberis/intrinsics/simd_register.h"
#include "berberis/intrinsics/type_traits.h"

namespace berberis::intrinsics {

enum class TailProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};

enum class InactiveProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};

enum class NoInactiveProcessing {
  kNoInactiveProcessing = 0,
};

template <typename ElementType>
[[nodiscard]] inline std::tuple<NoInactiveProcessing> FullMaskForRegister(NoInactiveProcessing) {
  return {NoInactiveProcessing{}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
FullMaskForRegister(SIMD128Register) {
  if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
    return {{0xffff}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
    return {{0xff}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
    return {{0xf}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
    return {{0x3}};
  } else {
    static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
  }
}
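
// Illustrative usage (a sketch, not part of the API): with 8-bit elements a SIMD128Register
// holds 16 lanes, so the full mask is 16 bits wide; with 64-bit elements it holds 2 lanes:
//   auto [mask8] = FullMaskForRegister<UInt8>(SIMD128Register{});    // RawInt16{0xffff}
//   auto [mask64] = FullMaskForRegister<UInt64>(SIMD128Register{});  // RawInt8{0x3}
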
71
72 template <typename ElementType>
MaskForRegisterInSequence(NoInactiveProcessing,size_t)73 [[nodiscard]] inline std::tuple<NoInactiveProcessing> MaskForRegisterInSequence(
74 NoInactiveProcessing,
75 size_t) {
76 return {NoInactiveProcessing{}};
77 }
78
79 template <typename ElementType>
80 [[nodiscard]] inline std::tuple<
81 std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
MaskForRegisterInSequence(SIMD128Register mask,size_t register_in_sequence)82 MaskForRegisterInSequence(SIMD128Register mask, size_t register_in_sequence) {
83 if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
84 return {mask.Get<RawInt16>(register_in_sequence)};
85 } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
86 return {mask.Get<RawInt8>(register_in_sequence)};
87 } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
88 return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 4)) &
89 UInt8{0b1111}}};
90 } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
91 return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 2)) &
92 UInt8{0b11}}};
93 } else {
94 static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
95 }
96 }
97
98 // Naïve implementation for tests. Also used on not-x86 platforms.
MakeBitmaskFromVlForTests(size_t vl)99 [[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVlForTests(size_t vl) {
100 if (vl == 128) {
101 return {SIMD128Register(__int128(0))};
102 } else {
103 return {SIMD128Register((~__int128(0)) << vl)};
104 }
105 }
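
// Illustrative example (hand-computed, not part of the API): MakeBitmaskFromVlForTests(8)
// returns a register whose low 8 bits are clear and whose upper 120 bits are set, i.e. a
// bitmask selecting everything at and past bit position vl.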

#ifndef __x86_64__
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
  return {MakeBitmaskFromVlForTests(vl)};
}
#endif

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
  return MakeBitmaskFromVl(vl * sizeof(ElementType) * CHAR_BIT);
}

// Naïve implementation for tests. Also used on non-x86 platforms.
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMaskForTests(size_t mask) {
  constexpr ElementType kZeroValue = ElementType{0};
  constexpr ElementType kFillValue = ~ElementType{0};
  SIMD128Register result;
  for (size_t index = 0; index < sizeof(SIMD128Register) / sizeof(ElementType); ++index) {
    size_t bit = 1 << index;
    if (mask & bit) {
      result.Set(kFillValue, index);
    } else {
      result.Set(kZeroValue, index);
    }
  }
  return {result};
}

#ifndef __x86_64__
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMask(size_t mask) {
  return {BitMaskToSimdMaskForTests<ElementType>(mask)};
}
#endif

// Naïve implementation for tests. Also used on non-x86 platforms.
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMaskForTests(SIMD128Register simd_mask) {
  using ResultType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
  ResultType mask{0};
  constexpr ResultType kElementsCount{
      static_cast<uint8_t>(sizeof(SIMD128Register) / sizeof(ElementType))};
  for (ResultType index{0}; index < kElementsCount; index += ResultType{1}) {
    if (simd_mask.Get<ElementType>(index) != ElementType{}) {
      mask |= ResultType{1} << ResultType{index};
    }
  }
  return mask;
}
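
// Illustrative round trip (a sketch, not part of the API), for 32-bit elements:
//   auto [simd] = BitMaskToSimdMaskForTests<UInt32>(0b0101);  // lanes 0 and 2 become all-ones
//   auto [bits] = SimdMaskToBitMaskForTests<UInt32>(simd);    // bits == 0b0101 again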

#ifndef __SSSE3__
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMask(SIMD128Register simd_mask) {
  return SimdMaskToBitMaskForTests<ElementType>(simd_mask);
}
#endif

template <auto kElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementToForTests(
    SIMD128Register simd_mask,
    SIMD128Register result) {
  using ElementType = decltype(kElement);
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    if (!simd_mask.Get<ElementType>(index)) {
      result.Set(kElement, index);
    }
  }
  return result;
}

#ifndef __x86_64__
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementTo(SIMD128Register simd_mask,
                                                                       SIMD128Register result) {
  return VectorMaskedElementToForTests(simd_mask, result);
}
#endif

template <typename ElementType>
[[nodiscard]] inline ElementType VectorElement(SIMD128Register src, int index) {
  return src.Get<ElementType>(index);
}

template <typename ElementType>
[[nodiscard]] inline ElementType VectorElement(ElementType src, int) {
  return src;
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VMovTopHalfToBottom(SIMD128Register src) {
  return {SIMD128Register{src.Get<uint64_t>(1)}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VMergeBottomHalfToTop(SIMD128Register bottom,
                                                                       SIMD128Register top) {
  SIMD128Register result{bottom};
  result.Set<uint64_t>(top.Get<uint64_t>(0), 1);
  return result;
}

// Naïve implementation for tests. Also used on non-x86 platforms.
template <auto kDefaultElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcastForTests() {
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof kDefaultElement;
  SIMD128Register dest;
  for (size_t index = 0; index < kElementsCount; ++index) {
    dest.Set(kDefaultElement, index);
  }
  return dest;
}

#ifndef __x86_64__
template <auto kDefaultElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcast() {
  return VectorBroadcastForTests<kDefaultElement>();
}
#endif

template <auto kDefaultElement, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof kDefaultElement);
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if constexpr (kDefaultElement == decltype(kDefaultElement){}) {
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl. But when both vstart and vl are larger than kElementsCount (vl is clamped
      // above) we hit this corner case and return all-zeroes.
      result = SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
    }
  } else if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result |= tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl. But when both vstart and vl are larger than kElementsCount (vl is clamped
      // above) we hit this corner case and return all-ones.
      result = ~SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result |= ~start_bitmask;
      result |= tail_bitmask;
    }
  } else {
    const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
        result |= (std::get<0>(dest) & tail_bitmask);
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl. But when both vstart and vl are larger than kElementsCount (vl is clamped
      // above) we hit this corner case and return dest.
      result = std::get<0>(dest);
    } else {
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
      result |= (std::get<0>(dest) & (~start_bitmask | tail_bitmask));
    }
  }
  return result;
}
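// Illustrative example (hand-computed, not part of the API): for kDefaultElement == UInt32{0},
// vstart == 0 and vl == 2, lanes [0, 2) of `result` are kept and lanes [2, 4) are zeroed;
// this is the tail being filled with the all-zeroes default element.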

template <auto kDefaultElement,
          TailProcessing vta,
          auto vma = NoInactiveProcessing{},
          typename MaskType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl,
                                                               MaskType mask) {
  static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
                 std::is_same_v<MaskType, NoInactiveProcessing>) ||
                (std::is_same_v<decltype(vma), InactiveProcessing> &&
                 (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
  if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
    const auto [simd_mask] = BitMaskToSimdMask<decltype(kDefaultElement)>(
        static_cast<typename MaskType::BaseType>(mask));
    if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
      result |= ~simd_mask;
    } else {
      result &= simd_mask;
      if constexpr (kDefaultElement != decltype(kDefaultElement){}) {
        const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
        result |= std::get<0>(dest) & ~simd_mask;
      }
    }
  }
  return VectorMasking<kDefaultElement, vta>(result, vstart, vl);
}

template <typename ElementType, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
    SIMD128Register dest,
    SIMD128Register result,
    int vstart,
    int vl,
    NoInactiveProcessing /*mask*/ = NoInactiveProcessing{}) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType));
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if (vstart == 0) [[likely]] {
    if (vl == kElementsCount) [[likely]] {
      return result;
    }
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      dest = result | tail_bitmask;
    } else {
      dest = (dest & tail_bitmask) | (result & ~tail_bitmask);
    }
  } else if (vstart < vl) [[likely]] {
    // Note: normally vstart < vl because RISC-V instructions don't alter the result if
    // vstart >= vl. But when both vstart and vl are larger than kElementsCount (vl is clamped
    // above) we hit that corner case and return dest unchanged.
    const auto [start_bitmask] = MakeBitmaskFromVl<ElementType>(vstart);
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      dest = (dest & ~start_bitmask) | (result & start_bitmask) | tail_bitmask;
    } else {
      dest = (dest & (~start_bitmask | tail_bitmask)) | (result & start_bitmask & ~tail_bitmask);
    }
  } else if constexpr (vta == TailProcessing::kAgnostic) {
    if (vstart == vl) {
      // Corner case: vstart == vl may happen because of vslideup:
      // https://github.com/riscv/riscv-v-spec/issues/263
      const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
      dest |= tail_bitmask;
    }
  }
  return {dest};
}
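// Illustrative example (a sketch, not part of the API): with 32-bit elements, vstart == 0 and
// vl == 3, lanes [0, 3) are taken from `result`; lane 3 keeps its old `dest` value under
// TailProcessing::kUndisturbed and is forced to all-ones under TailProcessing::kAgnostic:
//   auto [merged] = VectorMasking<UInt32, TailProcessing::kUndisturbed>(dest, result, 0, 3);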

template <typename ElementType,
          TailProcessing vta,
          auto vma = NoInactiveProcessing{},
          typename MaskType = NoInactiveProcessing>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
    SIMD128Register dest,
    SIMD128Register result,
    SIMD128Register result_mask,
    int vstart,
    int vl,
    MaskType mask = NoInactiveProcessing{}) {
  static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
                 std::is_same_v<MaskType, NoInactiveProcessing>) ||
                (std::is_same_v<decltype(vma), InactiveProcessing> &&
                 (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
  if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
    const auto [simd_mask] =
        BitMaskToSimdMask<ElementType>(static_cast<typename MaskType::BaseType>(mask));
    if (vma == InactiveProcessing::kAgnostic) {
      result |= ~simd_mask;
    } else {
      result = (result & simd_mask) | (result_mask & ~simd_mask);
    }
  }
  return VectorMasking<ElementType, vta>(dest, result, vstart, vl);
}

template <typename ElementType, TailProcessing vta, InactiveProcessing vma, typename MaskType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register dest,
                                                               SIMD128Register result,
                                                               int vstart,
                                                               int vl,
                                                               MaskType mask) {
  return VectorMasking<ElementType, vta, vma>(dest,
                                              result,
                                              /*result_mask=*/dest,
                                              vstart,
                                              vl,
                                              mask);
}

template <typename ElementType, typename... ParameterType>
inline constexpr bool kIsAllowedArgumentForVector =
    ((std::is_same_v<ParameterType, SIMD128Register> ||
      std::is_same_v<ParameterType, ElementType>)&&...);

// TODO(b/260725458): Pass lambda as template argument after C++20 becomes available.
template <typename ElementType, typename Lambda, typename... ParameterType>
inline std::tuple<SIMD128Register> VectorProcessing(Lambda lambda, ParameterType... parameters) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(VectorElement<ElementType>(parameters, index)...), index);
  }
  return result;
}

// TODO(b/260725458): Pass lambda as template argument after C++20 becomes available.
template <typename ElementType, typename Lambda, typename ResultType, typename... ParameterType>
inline std::tuple<ResultType> VectorProcessingReduce(Lambda lambda,
                                                     ResultType init,
                                                     ParameterType... parameters) {
  static_assert(std::is_same_v<ResultType, ElementType> ||
                std::is_same_v<ResultType, WideType<ElementType>>);
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    init = lambda(init, VectorElement<ElementType>(parameters, index)...);
  }
  return init;
}
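// Illustrative usage (a sketch, not part of the API): an element-wise add and a full-width
// sum reduction, mirroring what the DEFINE_*_INTRINSIC macros below generate:
//   auto [sum] = VectorProcessing<UInt32>([](auto x, auto y) { return x + y; }, src1, src2);
//   auto [acc] = VectorProcessingReduce<UInt32>([](auto acc, auto x) { return acc + x; },
//                                               UInt32{0}, src1);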

// SEW = 2*SEW op SEW
// TODO(b/260725458): Pass lambda as template argument after C++20 becomes available.
template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
inline std::tuple<SIMD128Register> VectorArithmeticNarrowwv(Lambda lambda,
                                                            ParameterType1 src1,
                                                            ParameterType2 src2) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(Narrow(lambda(VectorElement<WideType<ElementType>>(src1, index),
                             Widen(VectorElement<ElementType>(src2, index)))),
               index);
  }
  return result;
}

// 2*SEW = SEW op SEW
// TODO(b/260725458): Pass lambda as template argument after C++20 becomes available.
template <typename ElementType, typename Lambda, typename... ParameterType>
inline std::tuple<SIMD128Register> VectorArithmeticWidenvv(Lambda lambda,
                                                           ParameterType... parameters) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(Widen(VectorElement<ElementType>(parameters, index))...), index);
  }
  return result;
}

// 2*SEW = SEW op SEW op 2*SEW
// TODO(b/260725458): Pass lambda as template argument after C++20 becomes available.
template <typename ElementType,
          typename Lambda,
          typename ParameterType1,
          typename ParameterType2,
          typename ParameterType3>
inline std::tuple<SIMD128Register> VectorArithmeticWidenvvw(Lambda lambda,
                                                            ParameterType1 src1,
                                                            ParameterType2 src2,
                                                            ParameterType3 src3) {
  static_assert(
      kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2, ParameterType3>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(Widen(VectorElement<ElementType>(src1, index)),
                      Widen(VectorElement<ElementType>(src2, index)),
                      VectorElement<WideType<ElementType>>(src3, index)),
               index);
  }
  return result;
}

// 2*SEW = 2*SEW op SEW
// TODO(b/260725458): Pass lambda as template argument after C++20 becomes available.
template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
inline std::tuple<SIMD128Register> VectorArithmeticWidenwv(Lambda lambda,
                                                           ParameterType1 src1,
                                                           ParameterType2 src2) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(VectorElement<WideType<ElementType>>(src1, index),
                      Widen(VectorElement<ElementType>(src2, index))),
               index);
  }
  return result;
}
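// Illustrative example (a sketch, not part of the API): for 8-bit elements these helpers walk
// the low 8 lanes and widen each operand to 16 bits first, so e.g. VectorArithmeticWidenvv
// with a multiply lambda produces full 16-bit products that cannot overflow.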

template <typename ElementType>
SIMD128Register VectorExtend(SIMD128Register src) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(Widen(VectorElement<ElementType>(src, index)), index);
  }
  return result;
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf2(SIMD128Register src) {
  using SourceElementType = NarrowType<ElementType>;
  return {VectorExtend<SourceElementType>(src)};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf4(SIMD128Register src) {
  using WideSourceElementType = NarrowType<ElementType>;
  using SourceElementType = NarrowType<WideSourceElementType>;
  return {VectorExtend<WideSourceElementType>(VectorExtend<SourceElementType>(src))};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf8(SIMD128Register src) {
  using WideWideSourceElementType = NarrowType<ElementType>;
  return {
      VectorExtend<WideWideSourceElementType>(std::get<0>(Vextf4<WideWideSourceElementType>(src)))};
}
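// Illustrative example (a sketch, not part of the API): Vextf4<Int32>(src) extends the low
// four 8-bit lanes of src to 32 bits in two steps (8 -> 16 -> 32); with a signed ElementType
// the extension is arithmetic, with an unsigned one it is zero-extension.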

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> VidvForTests(size_t index) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  ElementType element = {static_cast<typename ElementType::BaseType>(index * kElementsCount)};
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(element, index);
    element += ElementType{1};
  }
  return result;
}

// Handles "slide up" for a single destination register. Effectively copies the last offset
// elements in [kElementsCount - offset, kElementsCount) of src1 followed by the first
// [0, kElementsCount - offset) elements of src2 into the result.
//
// This leaves result looking like
//
// result = {
//   src1[kElementsCount-offset+0],
//   src1[kElementsCount-offset+1],
//   ...,
//   src1[kElementsCount-offset+(offset-1)],
//   src2[0],
//   src2[1],
//   ...,
//   src2[kElementsCount-offset-1]
// };
template <typename ElementType>
inline std::tuple<SIMD128Register> VectorSlideUp(size_t offset,
                                                 SIMD128Register src1,
                                                 SIMD128Register src2) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  CHECK_LT(offset, kElementsCount);
  for (size_t index = 0; index < offset; ++index) {
    result.Set(VectorElement<ElementType>(src1, kElementsCount - offset + index), index);
  }
  for (size_t index = offset; index < kElementsCount; ++index) {
    result.Set(VectorElement<ElementType>(src2, index - offset), index);
  }
  return result;
}
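// Illustrative example (hand-computed, not part of the API): with four 32-bit lanes and
// offset == 1, VectorSlideUp<UInt32>(1, src1, src2) yields {src1[3], src2[0], src2[1],
// src2[2]}: the lanes below the slide boundary come from the previous register src1.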

// Handles "slide down" for a single destination register. Effectively copies the elements in
// [offset, kElementsCount) of src1 followed by the first [0, offset) elements of src2 into the
// result.
//
// This leaves result looking like
//
// result = {
//   [0] = src1[offset+0],
//   [1] = src1[offset+1],
//   ...,
//   [kElementsCount-offset-1] = src1[kElementsCount-1],
//   [kElementsCount-offset] = src2[0],
//   [kElementsCount-offset+1] = src2[1],
//   ...,
//   [kElementsCount-1] = src2[offset-1]
// };
template <typename ElementType>
inline std::tuple<SIMD128Register> VectorSlideDown(size_t offset,
                                                   SIMD128Register src1,
                                                   SIMD128Register src2) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  CHECK_LT(offset, kElementsCount);
  for (size_t index = 0; index < kElementsCount - offset; ++index) {
    result.Set(VectorElement<ElementType>(src1, offset + index), index);
  }
  for (size_t index = kElementsCount - offset; index < kElementsCount; ++index) {
    result.Set(VectorElement<ElementType>(src2, index - (kElementsCount - offset)), index);
  }
  return result;
}
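// Illustrative example (hand-computed, not part of the API): with four 32-bit lanes and
// offset == 1, VectorSlideDown<UInt32>(1, src1, src2) yields {src1[1], src1[2], src1[3],
// src2[0]}: the lane pulled in from above this register's range comes from the next
// register src2.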

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vcpopm(SIMD128Register simd_src) {
  UInt128 src = simd_src.Get<UInt128>();
  return Popcount(src);
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vfirstm(SIMD128Register simd_src) {
  UInt128 src = simd_src.Get<UInt128>();
  if (src == Int128{0}) {
    return ~UInt128{0};
  }
  return CountRZero(src);
}

#ifndef __x86_64__
template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vidv(size_t index) {
  return VidvForTests<ElementType>(index);
}
#endif

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsifm(SIMD128Register simd_src) {
  Int128 src = simd_src.Get<Int128>();
  return {(src - Int128{1}) ^ src};
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsbfm(SIMD128Register simd_src) {
  Int128 src = simd_src.Get<Int128>();
  if (src == Int128{0}) {
    return {~Int128{0}};
  }
  return {std::get<0>(Vmsifm(simd_src)).Get<Int128>() >> Int128{1}};
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsofm(SIMD128Register simd_src) {
  return {std::get<0>(Vmsbfm(simd_src)) ^ std::get<0>(Vmsifm(simd_src))};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register, size_t> Viotam(SIMD128Register simd_src, size_t counter) {
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  __uint128_t src = simd_src.Get<__uint128_t>();
  SIMD128Register result;
  for (size_t index = 0; index < kElementsCount; ++index) {
    typename Wrapping<typename ElementType::BaseType>::UnsignedType value{
        static_cast<typename ElementType::BaseType>(counter)};
    result.Set(value, index);
    counter += static_cast<size_t>(src & 1);
    src >>= 1;
  }
  return {result, counter};
}
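// Illustrative example (hand-computed, not part of the API): the three mask-set helpers above
// implement vmsif/vmsbf/vmsof via 128-bit arithmetic. For src == 0b01000:
//   Vmsifm -> 0b01111  (set up to and including the first set bit)
//   Vmsbfm -> 0b00111  (set strictly before the first set bit)
//   Vmsofm -> 0b01000  (set only the first set bit)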

template <typename TargetElementType,
          typename SourceElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vfcvtv(int8_t rm, int8_t frm, SIMD128Register src) {
  SIMD128Register result;
  size_t kElementsCount = std::min(sizeof(SIMD128Register) / sizeof(TargetElementType),
                                   sizeof(SIMD128Register) / sizeof(SourceElementType));
  for (size_t index = 0; index < kElementsCount; ++index) {
    if constexpr (!std::is_same_v<TargetElementType, Float16> &&
                  !std::is_same_v<TargetElementType, Float32> &&
                  !std::is_same_v<TargetElementType, Float64>) {
      result.Set(
          std::get<0>(FCvtFloatToInteger<typename TargetElementType::BaseType, SourceElementType>(
              rm, frm, src.Get<SourceElementType>(index))),
          index);
    } else if constexpr (!std::is_same_v<SourceElementType, Float16> &&
                         !std::is_same_v<SourceElementType, Float32> &&
                         !std::is_same_v<SourceElementType, Float64>) {
      result.Set(
          std::get<0>(FCvtIntegerToFloat<TargetElementType, typename SourceElementType::BaseType>(
              rm, frm, src.Get<typename SourceElementType::BaseType>(index))),
          index);
    } else {
      result.Set(std::get<0>(FCvtFloatToFloat<TargetElementType, SourceElementType>(
                     rm, frm, src.Get<SourceElementType>(index))),
                 index);
    }
  }
  return result;
}

// With widening multiplication intrinsics we may do sign-extension or zero-extension, but some
// intrinsics need a mix: Signed * Unsigned. We narrow the value down and then extend it again
// with the appropriate signedness. The compiler is smart enough to eliminate the dead code.
template <typename ElementType>
std::tuple<ElementType> WideMultiplySignedUnsigned(ElementType arg1, ElementType arg2) {
  return BitCastToUnsigned(Widen(BitCastToSigned(Narrow(arg1)))) *
         Widen(BitCastToUnsigned(Narrow(arg2)));
}
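// Illustrative example (hand-computed, not part of the API): for 16-bit lanes holding widened
// 8-bit operands, arg1 == 0xff reads as the signed value -1 while arg2 == 0xff reads as the
// unsigned value 255, so the product is 0xff01 (i.e. -255) rather than 0xfe01 (255 * 255).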

#define DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS(...) __VA_ARGS__
#define DEFINE_ARITHMETIC_INTRINSIC(Name, arithmetic, parameters, capture, arguments) \
  template <typename ElementType, \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessing<ElementType>( \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) { \
          static_assert((std::is_same_v<decltype(args), ElementType> && ...)); \
          arithmetic; \
        }, \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments); \
  }
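
// Illustrative expansion (a sketch, not part of the API): DEFINE_2OP_ARITHMETIC_INTRINSIC_VV
// below, instantiated as (add, (args + ...)), generates roughly
//   template <typename ElementType, enum PreferredIntrinsicsImplementation = ...>
//   inline std::tuple<SIMD128Register> Vaddvv(SIMD128Register src1, SIMD128Register src2) {
//     return VectorProcessing<ElementType>(
//         [](auto... args) { return (args + ...); }, src1, src2);
//   }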

#define DEFINE_1OP_ARITHMETIC_INTRINSIC_V(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src), (), (src))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; }); \
                              , \
                              (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
                              (), \
                              (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vv, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vx, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vx, return ({ __VA_ARGS__; }); \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vf, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_1OP_ARITHMETIC_INTRINSIC_X(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##x, return ({ __VA_ARGS__; });, (ElementType src), (), (src))

#define DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (int8_t csr, SIMD128Register src), (csr), (src))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vf, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vx, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Name, arithmetic, parameters, capture, arguments) \
  template <typename ElementType, \
            typename ResultType = ElementType, \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> \
  inline std::tuple<ResultType> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessingReduce<ElementType>( \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) { \
          static_assert((std::is_same_v<decltype(args), ElementType> && ...)); \
          arithmetic; \
        }, \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments); \
  }

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(name, ...) \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(V##name##vs, return ({ __VA_ARGS__; }); \
                                     , (ResultType init, SIMD128Register src), (), (init, src))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(name, ...) \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC( \
      Vfred##name##vs, return ({ __VA_ARGS__; }); \
      , (int8_t csr, ResultType init, SIMD128Register src), (csr), (init, src))

#define DEFINE_W_ARITHMETIC_INTRINSIC(Name, Pattern, arithmetic, parameters, capture, arguments) \
  template <typename ElementType, \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorArithmetic##Pattern<ElementType>( \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) { \
          static_assert((std::is_same_v<decltype(args), WideType<ElementType>> && ...)); \
          arithmetic; \
        }, \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments); \
  }

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vv, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vv, Widenvv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vf, Widenvv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##wv, Widenwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##wf, Widenwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vw##name##vv, Widenvvw, return ({ __VA_ARGS__; }); \
      , \
      (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (), \
      (src1, src2, src3))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vx, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vw##name##vx, Widenvvw, return ({ __VA_ARGS__; }); \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vv, Widenvvw, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vf, Widenvvw, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vv, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wv, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wx, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vx, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_X(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(frsqrt7, RSqrtEstimate(args...))
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(
    fclass,
    static_cast<typename TypeTraits<ElementType>::Int>(std::get<0>(FClass(args...))))

DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(fsqrt,
                                       CanonicalizeNanTuple(FSqrt(FPFlags::DYN, csr, args...)))

DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redsum, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(rsub, auto [arg1, arg2] = std::tuple{args...}; (arg2 - arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redand, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redor, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redxor, (args ^ ...))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frsub, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, arg2, arg1)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(osum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(usum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frdiv, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, arg2, arg1)))
// A SIMD mask holds per-element results with either all bits set to 0 or all bits set to 1.
// This way it may be used with VAnd and VAndN operations to perform masking.
// Such a comparison is effectively one x86-64 instruction (via SSE or AVX), but to achieve it
// we need to multiply the bool result by (~IntType{0}) or (~ElementType{0}).
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
// Note: for floating point numbers Flt(b, a) and !Fle(a, b) produce different and incompatible
// results. IEEE754-2008 defines the NE (!=) predicate as the negation of the EQ (==) predicate,
// while GT (>) and GE (>=) are not negations of the LT (<) or LE (<=) predicates but instead use
// a swap of arguments. Note that the scalar form includes only three predicates (Feq, Flt, Fle)
// while the vector form includes Vmfgt.vf and Vmfge.vf instructions only for the vector+scalar
// case (the vector+vector case is supposed to be handled by swapping arguments). More here:
// https://github.com/riscv/riscv-v-spec/issues/300
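// Illustrative example (hand-computed, not part of the API): if arg1 is NaN then
// Flt(arg2, arg1) is false (unordered comparisons are false) while !Fle(arg1, arg2) would be
// true, so the Vfgtvx/Vfgevx definitions below must swap arguments rather than negate.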
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fgt, auto [arg1, arg2] = std::tuple{args...};
                                   using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(arg2, arg1))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fge, auto [arg1, arg2] = std::tuple{args...};
                                   using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(arg2, arg1))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    seq,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    seq,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    sne,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    sne,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    slt,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    slt,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    sle,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    sle,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    sgt,
    (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args > ...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg3) + arg1))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))

DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjx, std::get<0>(FSgnjx(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjx, std::get<0>(FSgnjx(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmin, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmax, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))

DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                              arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                              arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccus, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg1, arg2))) +
                                              arg3)
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})

#undef DEFINE_ARITHMETIC_INTRINSIC
#undef DEFINE_W_ARITHMETIC_INTRINSIC
#undef DEFINE_ARITHMETIC_REDUCE_INTRINSIC
#undef DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_V
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_X
#undef DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW

}  // namespace berberis::intrinsics

#endif  // BERBERIS_INTRINSICS_RISCV64_VECTOR_INTRINSICS_H_