// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
    #error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vfloat8() = default;

    /**
     * @brief Construct from 8 values loaded from an unaligned address.
     *
     * Consider using loada() instead if the data is aligned to the vector
     * length.
     */
    ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
    {
        m = _mm256_loadu_ps(p);
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vfloat8(float a)
    {
        m = _mm256_set1_ps(a);
    }

    /**
     * @brief Construct from 8 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat8(
        float a, float b, float c, float d,
        float e, float f, float g, float h)
    {
        m = _mm256_set_ps(h, g, f, e, d, c, b, a);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
    #if !defined(__clang__) && defined(_MSC_VER)
        return m.m256_f32[l];
    #else
        union { __m256 m; float f[8]; } cvt;
        cvt.m = m;
        return cvt.f[l];
    #endif
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vfloat8 zero()
    {
        return vfloat8(_mm256_setzero_ps());
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
    {
        return vfloat8(_mm256_broadcast_ss(p));
    }

    /**
     * @brief Factory that returns a vector loaded from 32B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
    {
        return vfloat8(_mm256_load_ps(p));
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vfloat8 lane_id()
    {
        return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
    }

    /**
     * @brief The vector data, stored as a native SIMD register.
     */
    __m256 m;
};

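// Usage sketch (illustrative only; the variable names and values below are
// hypothetical, not part of the library):
//
//     alignas(ASTCENC_VECALIGN) float buf[8] { 0, 1, 2, 3, 4, 5, 6, 7 };
//     vfloat8 a = vfloat8::loada(buf);     // aligned load of 8 floats
//     vfloat8 b(1.5f);                     // broadcast a scalar to all lanes
//     vfloat8 ids = vfloat8::lane_id();    // 0, 1, 2, ... 7
//     float l0 = a.lane<0>();              // read back lane 0 (== 0.0f)
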
// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vint8() = default;

    /**
     * @brief Construct from 8 values loaded from an unaligned address.
     *
     * Consider using loada() instead if the data is aligned to the vector
     * length.
     */
    ASTCENC_SIMD_INLINE explicit vint8(const int *p)
    {
        m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
    }

    /**
     * @brief Construct from 8 uint8_t loaded from an unaligned address.
     */
    ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
    {
        // _mm_loadu_si64 would be nicer syntax, but missing on older GCC
        m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using vint8::zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vint8(int a)
    {
        m = _mm256_set1_epi32(a);
    }

    /**
     * @brief Construct from 8 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint8(
        int a, int b, int c, int d,
        int e, int f, int g, int h)
    {
        m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar from a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE int lane() const
    {
    #if !defined(__clang__) && defined(_MSC_VER)
        return m.m256i_i32[l];
    #else
        union { __m256i m; int f[8]; } cvt;
        cvt.m = m;
        return cvt.f[l];
    #endif
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vint8 zero()
    {
        return vint8(_mm256_setzero_si256());
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
    {
        __m128i a = _mm_set1_epi32(*p);
        return vint8(_mm256_broadcastd_epi32(a));
    }

    /**
     * @brief Factory that returns a vector loaded from 32B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
    {
        return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vint8 lane_id()
    {
        return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
    }

    /**
     * @brief The vector data, stored as a native SIMD register.
     */
    __m256i m;
};

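// Usage sketch (illustrative only; names and values are hypothetical):
//
//     alignas(ASTCENC_VECALIGN) int buf[8] { 0, 1, 2, 3, 4, 5, 6, 7 };
//     vint8 a = vint8::loada(buf);     // aligned load of 8 ints
//     vint8 b(16);                     // broadcast a scalar to all lanes
//     vint8 c = (a + b) * vint8(2);    // lane-wise ops defined further below
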
// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
    {
        m = a;
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
    {
        m = _mm256_castsi256_ps(a);
    }

    /**
     * @brief Construct from 1 scalar value.
     */
    ASTCENC_SIMD_INLINE explicit vmask8(bool a)
    {
        vint8 mask(a == false ? 0 : -1);
        m = _mm256_castsi256_ps(mask.m);
    }

    /**
     * @brief The vector data, stored as a native SIMD register.
     */
    __m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
    return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
    return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
    return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
    return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
    return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
    return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
    return mask(a) == 0xFF;
}

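// Usage sketch (illustrative only): masks are produced by the comparison
// operators defined later in this file, and consumed by select() and the
// reductions above.
//
//     vint8 a = vint8::lane_id();
//     vmask8 m = a > vint8(3);        // lanes 4..7 enabled
//     unsigned int bits = mask(m);    // 0xF0
//     bool some = any(m);             // true
//     bool every = all(m);            // false
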
// ============================================================================
// vint8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
    return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
    a = a + b;
    return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
    return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
    return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
    return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
    return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
    return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
    return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
    return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
    return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
    return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
    return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
    return vint8(_mm256_slli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
    return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
    return vint8(_mm256_srli_epi32(a.m, s));
}

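// Usage sketch (illustrative only): the shift amount is a template parameter,
// matching the immediate-operand forms of the underlying intrinsics.
//
//     vint8 v(static_cast<int>(0x80000010));
//     vint8 a = lsl<4>(v);    // 0x00000100 per lane
//     vint8 b = lsr<4>(v);    // 0x08000001 per lane (zero fill)
//     vint8 c = asr<4>(v);    // 0xF8000001 per lane (sign fill)
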
/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
    return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
    return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
    __m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
    m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
    m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
    m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

    __m256i r = astcenc_mm256_set_m128i(m, m);
    vint8 vmin(r);
    return vmin;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
    __m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
    m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
    m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
    m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

    __m256i r = astcenc_mm256_set_m128i(m, m);
    vint8 vmax(r);
    return vmax;
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
    _mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
    // This is the most logical implementation, but the convenience intrinsic
    // is missing on older compilers (supported in g++ 9 and clang++ 9).
    // _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
    _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
    return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
{
    __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                   0, 0, 0, 0, 28, 24, 20, 16,
                                   0, 0, 0, 0, 0, 0, 0, 0,
                                   0, 0, 0, 0, 12, 8, 4, 0);
    __m256i a = _mm256_shuffle_epi8(v.m, shuf);
    __m128i a0 = _mm256_extracti128_si256(a, 0);
    __m128i a1 = _mm256_extracti128_si256(a, 1);
    __m128i b = _mm_unpacklo_epi32(a0, a1);

    __m256i r = astcenc_mm256_set_m128i(b, b);
    return vint8(r);
}

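// Usage sketch (illustrative only, not taken from calling code): one plausible
// way to combine the two byte-narrowing helpers above is to pack eight 0-255
// lane values and write them out as 8 bytes in a single store.
//
//     vint8 packed = pack_low_bytes(texels);    // "texels" is hypothetical
//     uint8_t out[8];
//     store_nbytes(packed, out);                // unaligned destination is fine
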
/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
    __m256i condi = _mm256_castps_si256(cond.m);
    return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}

// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
    a = a + b;
    return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
    return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
    return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
    return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
    return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
    return min(a, vfloat8(b));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
    return max(a, vfloat8(b));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
    // Do not reorder - the second operand is returned if either input is NaN
    a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
    a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
    return a;
}

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
    a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
    a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
    return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
    a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
    a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
    return a;
}

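// Usage sketch (illustrative only): the clamp helpers apply the max() before
// the min(), so a NaN input resolves to the lower bound as documented above.
//
//     vfloat8 v(-0.5f);
//     vfloat8 a = clamp(0.0f, 1.0f, v);    // 0.0f in every lane
//     vfloat8 b = clampz(4.0f, v);         // 0.0f in every lane
//     vfloat8 c = clampzo(v);              // 0.0f in every lane
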
/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
    __m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
    return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
    constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    return vfloat8(_mm256_round_ps(a.m, flags));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
    __m128 vlow = _mm256_castps256_ps128(a.m);
    __m128 vhigh = _mm256_extractf128_ps(a.m, 1);
    vlow = _mm_min_ps(vlow, vhigh);

    // First do a horizontal reduction.
    __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 mins = _mm_min_ps(vlow, shuf);
    shuf = _mm_movehl_ps(shuf, mins);
    mins = _mm_min_ss(mins, shuf);

    // This is the most logical implementation, but the convenience intrinsic
    // is missing on older compilers (supported in g++ 9 and clang++ 9).
    // __m256 r = _mm256_set_m128(mins, mins);
    __m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

    return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
    return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
    __m128 vlow = _mm256_castps256_ps128(a.m);
    __m128 vhigh = _mm256_extractf128_ps(a.m, 1);
    vhigh = _mm_max_ps(vlow, vhigh);

    // First do a horizontal reduction.
    __m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 maxs = _mm_max_ps(vhigh, shuf);
    shuf = _mm_movehl_ps(shuf, maxs);
    maxs = _mm_max_ss(maxs, shuf);

    // This is the most logical implementation, but the convenience intrinsic
    // is missing on older compilers (supported in g++ 9 and clang++ 9).
    // __m256 r = _mm256_set_m128(maxs, maxs);
    __m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
    return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
    return hmax(a).lane<0>();
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
    // Two sequential 4-wide adds give invariance with 4-wide code
    vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
    vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
    return hadd_s(lo) + hadd_s(hi);
}

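// Usage sketch (illustrative only): hmin()/hmax() broadcast the result to all
// lanes, while the _s variants return it as a scalar.
//
//     vfloat8 v(3.0f, 1.0f, 4.0f, 1.0f, 5.0f, 9.0f, 2.0f, 6.0f);
//     float lo  = hmin_s(v);    // 1.0f
//     float hi  = hmax_s(v);    // 9.0f
//     float sum = hadd_s(v);    // 31.0f
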
/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
    return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
{
    return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

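// Usage sketch (illustrative only): combining a comparison with select() gives
// a branch-free lane-wise conditional.
//
//     vfloat8 a(1.0f);
//     vfloat8 b(2.0f);
//     vmask8 m = a < b;               // true in every lane here
//     vfloat8 r = select(a, b, m);    // b where m is set, else a
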
/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
    vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
    haccumulate(accum, lo);

    vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
    haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
    accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
    a = select(vfloat8::zero(), a, m);
    haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
    a = select(vfloat8::zero(), a, m);
    haccumulate(accum, a);
}

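// Usage sketch (illustrative only, assuming the 4-wide vfloat4 type from the
// companion vecmathlib headers): accumulating into a vfloat4 folds the two
// 128-bit halves in a fixed order, so the result matches a 4-wide build
// bit-for-bit; accumulating into a vfloat8 is cheaper but not invariant.
//
//     vfloat4 accum4 = vfloat4::zero();
//     haccumulate(accum4, vfloat8(1.0f));    // invariant with 4-wide builds
//
//     vfloat8 accum8 = vfloat8::zero();
//     haccumulate(accum8, vfloat8(1.0f));    // faster, but not invariant
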
/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
    return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
    return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
    _mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
    _mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
    return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
    a = round(a);
    return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
    return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
    return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
    return vfloat8(_mm256_castsi256_ps(a.m));
}

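// Worked sketch (illustrative only): these casts enable IEEE 754 bit tricks
// without leaving the SIMD registers. For example, clearing the sign bit is
// an alternative way of expressing abs():
//
//     vfloat8 v(-2.5f);
//     vint8 bits = float_as_int(v) & vint8(0x7FFFFFFF);
//     vfloat8 absv = int_as_float(bits);    // 2.5f in every lane
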
/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
{
    // AVX2 duplicates the table within each 128-bit lane
    __m128i t0n = t0.m;
    t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
{
    // AVX2 duplicates the table within each 128-bit lane
    __m128i t0n = t0.m;
    t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

    __m128i t1n = _mm_xor_si128(t0.m, t1.m);
    t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vint4 t0, vint4 t1, vint4 t2, vint4 t3,
    vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
{
    // AVX2 duplicates the table within each 128-bit lane
    __m128i t0n = t0.m;
    t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

    __m128i t1n = _mm_xor_si128(t0.m, t1.m);
    t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));

    __m128i t2n = _mm_xor_si128(t1.m, t2.m);
    t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));

    __m128i t3n = _mm_xor_si128(t2.m, t3.m);
    t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
{
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m256i result = _mm256_shuffle_epi8(t0.m, idxx);
    return vint8(result);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
{
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m256i result = _mm256_shuffle_epi8(t0.m, idxx);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
    result = _mm256_xor_si256(result, result2);
    return vint8(result);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
{
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m256i result = _mm256_shuffle_epi8(t0.m, idxx);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
    result = _mm256_xor_si256(result, result2);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    result2 = _mm256_shuffle_epi8(t2.m, idxx);
    result = _mm256_xor_si256(result, result2);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    result2 = _mm256_shuffle_epi8(t3.m, idxx);
    result = _mm256_xor_si256(result, result2);

    return vint8(result);
}

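// Design note and usage sketch (illustrative only; t0, t1 and idx below are
// hypothetical): vtable_prepare() stores the first table directly and each
// later table as an XOR delta against its predecessor. In the lookups above
// an in-range index selects the same low-4-bit position from every table at
// or below its range (the shuffle ignores the upper index bits while the
// byte MSB is clear), so XOR-ing the partial results telescopes to the wanted
// table; once the running index goes negative its MSB is set and the
// remaining shuffles contribute zero.
//
//     vint8 t0p, t1p;
//     vtable_prepare(t0, t1, t0p, t1p);          // t0, t1 are vint4 tables
//     vint8 v = vtable_8bt_32bi(t0p, t1p, idx);  // idx lanes in 0..31
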
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
    return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}

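// Usage sketch (illustrative only; r, g, b, a and dst are hypothetical): with
// each channel holding 0-255 lane values this packs eight RGBA8 texels, one
// per 32-bit lane, ready to store.
//
//     vint8 texels = interleave_rgba8(r, g, b, a);
//     store(texels, reinterpret_cast<int*>(dst));
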
/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of the vector, after all non-masked
 * lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask)
{
    _mm256_maskstore_epi32(base, _mm256_castps_si256(mask.m), data.m);
}

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
    alignas(ASTCENC_VECALIGN) int v[8];
    storea(a, v);
    printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of ints, in hex.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
    alignas(ASTCENC_VECALIGN) int v[8];
    storea(a, v);
    printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
    alignas(ASTCENC_VECALIGN) float v[8];
    storea(a, v);
    printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
           static_cast<double>(v[0]), static_cast<double>(v[1]),
           static_cast<double>(v[2]), static_cast<double>(v[3]),
           static_cast<double>(v[4]), static_cast<double>(v[5]),
           static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
    print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED