// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */
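
// Illustrative sketch only: assuming the usual astcenc pattern where
// astcenc_vecmathlib.h aliases vfloat/vint/vmask to these 8-wide types and
// sets ASTCENC_SIMD_WIDTH to 8 when AVX2 is selected, VLA code such as the
// dot product below compiles against this implementation unchanged:
//
//     vfloat sum = vfloat::zero();
//     for (unsigned int i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
//     {
//         vfloat av(a + i);
//         vfloat bv(b + i);
//         sum += av * bv;
//     }
//     float result = hadd_s(sum);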

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is aligned to the vector
	 * length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
	{
		m = _mm256_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
	{
		m = _mm256_set1_ps(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(
		float a, float b, float c, float d,
		float e, float f, float g, float h)
	{
		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256_f32[l];
	#else
		union { __m256 m; float f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 zero()
	{
		return vfloat8(_mm256_setzero_ps());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
	{
		return vfloat8(_mm256_broadcast_ss(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
	{
		return vfloat8(_mm256_load_ps(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
	{
		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector value.
	 */
	__m256 m;
};

// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is aligned to the vector
	 * length.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
	{
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
	}

	/**
	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
	{
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(int a)
	{
		m = _mm256_set1_epi32(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(
		int a, int b, int c, int d,
		int e, int f, int g, int h)
	{
		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256i_i32[l];
	#else
		union { __m256i m; int f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint8 zero()
	{
		return vint8(_mm256_setzero_si256());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
	{
		__m128i a = _mm_set1_epi32(*p);
		return vint8(_mm256_broadcastd_epi32(a));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
	{
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint8 lane_id()
	{
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The vector value.
	 */
	__m256i m;
};

// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
	{
		m = _mm256_castsi256_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
	{
		vint8 mask(a == false ? 0 : -1);
		m = _mm256_castsi256_ps(mask.m);
	}

	/**
	 * @brief The vector value.
	 */
	__m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
	return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}
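
// Illustrative example only (assumed values): with lanes 0..2 enabled and the
// rest disabled, bit i of the return value mirrors lane i:
//
//     vmask8 m = vint8::lane_id() < vint8(3);
//     unsigned int bits = mask(m);   // bits == 0b00000111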

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
	return mask(a) == 0xFF;
}

// ============================================================================
// vint8 operators and functions
// ============================================================================
/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
	return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
	return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
	return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
	return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
	return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
	return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
	return vint8(_mm256_slli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
	return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
	return vint8(_mm256_srli_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
	return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
	return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmin(r);
	return vmin;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmax(r);
	return vmax;
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
{
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 28, 24, 20, 16,
	                               0, 0, 0, 0,  0,  0,  0,  0,
	                               0, 0, 0, 0, 12,  8,  4,  0);
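	// Explanatory note: _mm256_shuffle_epi8 shuffles within each 128-bit half
	// independently, so the selector above gathers byte 0 of each 32-bit lane
	// into the low 32 bits of its own half. The unpack below then merges the
	// two halves so all 8 packed bytes land in the bottom of the vector.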
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
	__m128i a0 = _mm256_extracti128_si256(a, 0);
	__m128i a1 = _mm256_extracti128_si256(a, 1);
	__m128i b = _mm_unpacklo_epi32(a0, a1);

	__m256i r = astcenc_mm256_set_m128i(b, b);
	return vint8(r);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
	__m256i condi = _mm256_castps_si256(cond.m);
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}

// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
	return min(a, vfloat8(b));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
	return max(a, vfloat8(b));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - second operand will return if either is NaN
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow = _mm_min_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf = _mm_movehl_ps(shuf, mins);
	mins = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(mins, mins);
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf = _mm_movehl_ps(shuf, maxs);
	maxs = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(maxs, maxs);
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return hmax(a).lane<0>();
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}
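
// Explanatory note: reducing each 128-bit half with the 4-wide hadd_s() and
// then adding the two partial sums uses the same add ordering as processing
// the same data as two 4-wide chunks, which is what keeps the result
// bit-identical to the 4-wide implementation.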

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	haccumulate(accum, lo);

	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
	accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
	return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
	a = round(a);
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
	return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
	return vfloat8(_mm256_castsi256_ps(a.m));
}
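
// Illustrative sketch only of the bit-pattern round trip described above; the
// mask value is an example, not something used by this header:
//
//     vint8 bits = float_as_int(a);
//     bits = bits & vint8(static_cast<int>(0xFF800000)); // keep sign + exponent
//     vfloat8 b = int_as_float(bits);                     // mantissa cleared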

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));

	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));

	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t2.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t3.m, idxx);
	result = _mm256_xor_si256(result, result2);

	return vint8(result);
}
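
// Explanatory note on the lookups above: vtable_prepare() stores the tables
// as an XOR chain (t0, t0 ^ t1, t1 ^ t2, t2 ^ t3). Each shuffle step selects
// byte (index % 16) while the index byte is still non-negative, so XOR-ing
// the partial results cancels the contributions from earlier tables and
// leaves the byte from the 16-entry slice that owns the index; once the
// repeated subtract-16 drives the index byte negative, later shuffles
// contribute zero.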

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
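
// Illustrative example only (assumed values): a lane with r = 0x11, g = 0x22,
// b = 0x33, a = 0x44 packs to 0x44332211, so R sits in the least significant
// byte and the lane stores bytes R, G, B, A in little-endian memory order.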

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask)
{
	_mm256_maskstore_epi32(base, _mm256_castps_si256(mask.m), data.m);
}

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(ASTCENC_VECALIGN) int v[8];
	storea(a, v);
	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of ints, in hexadecimal.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
	alignas(ASTCENC_VECALIGN) int v[8];
	storea(a, v);
	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
	alignas(ASTCENC_VECALIGN) float v[8];
	storea(a, v);
	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
	print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED