// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for
 * Armv8-A NEON.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */

#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is 16B aligned.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
	{
		m = vld1q_f32(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m = vdupq_n_f32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		float v[4] { a, b, c, d };
		m = vld1q_f32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return vgetq_lane_f32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m = vld1q_lane_f32(&a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(vdupq_n_f32(0.0f));
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(vdupq_n_f32(*p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(vld1q_f32(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
	{
		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
		return vfloat4(vld1q_f32(data));
	}

	/**
	 * @brief Return a 2-component swizzle, with the unused lanes set to zero.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a 3-component swizzle, with the unused lane set to zero.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a 4-component swizzle.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The vector data.
	 */
	float32x4_t m;
};
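
// Illustrative usage sketch (not part of the API): constructing, swizzling,
// and reading back lanes. Lane 0 is the LSB of the register, matching the
// constructor argument order documented above.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);   // lanes { 1, 2, 3, 4 }
//     float x = v.lane<0>();               // 1.0f
//     vfloat4 zyx = v.swz<2, 1, 0>();      // lanes { 3, 2, 1, 0 }
//     v.set_lane<3>(9.0f);                 // lanes { 1, 2, 3, 9 }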

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is 16B aligned.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
	{
		m = vld1q_s32(p);
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
	{
		uint32x2_t t8 {};
		// Cast is safe - NEON loads are allowed to be unaligned
		t8 = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), t8, 0);
		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
		m = vreinterpretq_s32_u32(vmovl_u16(t16));
	}
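
	// Illustrative sketch (not part of the API): the constructor above widens
	// each loaded byte into its own 32-bit lane via the u8 -> u16 -> u32 path.
	//
	//     const uint8_t bytes[4] { 0x10, 0x20, 0x30, 0x40 };
	//     vint4 v(bytes);     // lanes { 0x10, 0x20, 0x30, 0x40 }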

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m = vdupq_n_s32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		int v[4] { a, b, c, d };
		m = vld1q_s32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return vgetq_lane_s32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m = vld1q_lane_s32(&a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		alignas(16) static const int data[4] { 0, 1, 2, 3 };
		return vint4(vld1q_s32(data));
	}

	/**
	 * @brief The vector data.
	 */
	int32x4_t m;
};

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
	{
		m = a;
	}

#if !defined(_MSC_VER)
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
	{
		m = vreinterpretq_u32_s32(a);
	}
#endif

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		int v[4] {
			a == true ? -1 : 0,
			b == true ? -1 : 0,
			c == true ? -1 : 0,
			d == true ? -1 : 0
		};

		int32x4_t ms = vld1q_s32(v);
		m = vreinterpretq_u32_s32(ms);
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
	{
		return vgetq_lane_u32(m, l);
	}

	/**
	 * @brief The vector data.
	 */
	uint32x4_t m;
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(vorrq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(vandq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(veorq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(vmvnq_u32(a.m));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * Bit N of the result is set if lane N of the mask is set (bit 0 = lane 0).
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	static const int shifta[4] { 0, 1, 2, 3 };
	static const int32x4_t shift = vld1q_s32(shifta);

	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
	return vaddvq_u32(vshlq_u32(tmp, shift));
}
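
// Illustrative sketch (not part of the API): using mask() to test which lanes
// of a comparison passed. With lanes { true, false, true, false } the 4-bit
// code is 0b0101, so "any" and "all" style checks reduce to integer tests.
//
//     vmask4 m(true, false, true, false);
//     unsigned int bits = mask(m);   // 0x5
//     bool any_set = bits != 0;      // true
//     bool all_set = bits == 0xF;    // false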

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(vaddq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(vsubq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(vmulq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(vmvnq_s32(a.m));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(vorrq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(vandq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(veorq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return ~vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(vcltq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(vcgtq_s32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	uint32x4_t ua = vreinterpretq_u32_s32(a.m);
	ua = vshlq_u32(ua, vdupq_n_s32(-s));
	return vint4(vreinterpretq_s32_u32(ua));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(vminq_s32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(vmaxq_s32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	return vint4(vminvq_s32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	return vint4(vmaxvq_s32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
	int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
	return vget_lane_s32(vpadd_s32(t, t), 0);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);
	alignas(16) int vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];
	return vint4(vals);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
	alignas(16) uint8_t shuf[16] {
		0, 4, 8, 12,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0
	};
	uint8x16_t idx = vld1q_u8(shuf);
	int8x16_t av = vreinterpretq_s8_s32(a.m);
	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4(vbslq_s32(cond.m, b.m, a.m));
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(vaddq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(vsubq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(vmulq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(vdivq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(vceqq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(vcltq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgtq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcleq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgeq_f32(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If only one lane value is NaN, the other (numeric) value is returned for
 * that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	// vminnmq_f32 implements the IEEE 754 minNum operation, so a single NaN
	// operand does not propagate into the result
	return vfloat4(vminnmq_f32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If only one lane value is NaN, the other (numeric) value is returned for
 * that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	// vmaxnmq_f32 implements the IEEE 754 maxNum operation, so a single NaN
	// operand does not propagate into the result
	return vfloat4(vmaxnmq_f32(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	float32x4_t zero = vdupq_n_f32(0.0f);
	float32x4_t inv = vsubq_f32(zero, a.m);
	return vfloat4(vmaxq_f32(a.m, inv));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	return vfloat4(vrndnq_f32(a.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	return vfloat4(vminvq_f32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	return vfloat4(vmaxvq_f32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Perform a halving add to ensure invariance; we cannot use vaddvq_f32 as
	// this does (0 + 1 + 2 + 3), which is not invariant with the x86
	// (0 + 2) + (1 + 3) ordering.
	float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
	return vget_lane_f32(vpadd_f32(t, t), 0);
}
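
// Illustrative sketch (not part of the API): for lanes { a0, a1, a2, a3 } the
// reduction above computes (a0 + a2) + (a1 + a3), matching the pairing used by
// the x86 implementations so cross-platform results stay bit identical.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float s = hadd_s(v);   // (1 + 3) + (2 + 4) = 10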

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(vsqrtq_f32(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
	uint32x4_t mask = vcgeq_u32(cond.m, msb);
	return vfloat4(vbslq_f32(mask, b.m, a.m));
}
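
// Illustrative sketch (not part of the API): select() keys off the full lane
// mask produced by a comparison, while select_msb() inspects only the sign
// bit, which is useful when the "mask" is really sign data from arithmetic.
//
//     vfloat4 a(1.0f), b(2.0f);
//     vmask4 cond = vfloat4(-1.0f, 1.0f, -1.0f, 1.0f) < vfloat4::zero();
//     vfloat4 r = select(a, b, cond);   // lanes { 2, 1, 2, 1 }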

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);
	alignas(16) float vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];
	return vfloat4(vals);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = round(a);
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(vcvtq_f32_s32(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	// Generate float16 value
	float16x4_t f16 = vcvt_f16_f32(a.m);

	// Convert each 16-bit float pattern to a 32-bit pattern
	uint16x4_t u16 = vreinterpret_u16_f16(f16);
	uint32x4_t u32 = vmovl_u16(u16);
	return vint4(vreinterpretq_s32_u32(u32));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	vfloat4 av(a);
	return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	// Convert each 32-bit pattern to a 16-bit float pattern
	uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
	uint16x4_t u16 = vmovn_u32(u32);
	float16x4_t f16 = vreinterpret_f16_u16(u16);

	// Generate float32 value
	return vfloat4(vcvt_f32_f16(f16));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	vint4 av(a);
	return float16_to_float(av).lane<0>();
}
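
// Illustrative sketch (not part of the API): the float16 helpers store raw
// 16-bit patterns in the low half of each 32-bit lane, so a value can round
// trip through the narrower format.
//
//     uint16_t h = float_to_float16(1.5f);   // 0x3E00
//     float f = float16_to_float(h);         // 1.5f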

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	return vint4(vreinterpretq_s32_f32(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
	return vfloat4(vreinterpretq_f32_s32(v.m));
}
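
// Illustrative sketch (not part of the API): one example of the bit-pattern
// flip described above is clearing the IEEE 754 sign bit to compute a cheap
// absolute value.
//
//     vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);
//     vfloat4 absv = int_as_float(bits);   // lanes { 1, 2, 3, 4 }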

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	t0p = t0;
	t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	t0p = t0;
	t1p = t1;
	t2p = t2;
	t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
	int8x16_t table {
		vreinterpretq_s8_s32(t0.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
	int8x16x2_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
	int8x16x4_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m),
		vreinterpretq_s8_s32(t2.m),
		vreinterpretq_s8_s32(t3.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
}
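
// Illustrative sketch (not part of the API): a 16-entry byte table lookup. The
// index masking above keeps only the low byte of each 32-bit index valid, so
// each lane selects one table byte and zero-extends it.
//
//     vint4 table(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C);
//     vint4 t0p;
//     vtable_prepare(table, t0p);
//     vint4 r = vtable_8bt_32bi(t0p, vint4(0, 5, 10, 15));   // { 0, 5, 10, 15 }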

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with the high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
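
// Illustrative sketch (not part of the API): packing four texels where each
// channel vector holds one byte per lane. On a little-endian target each
// packed lane reads back in memory as R, G, B, A byte order.
//
//     vint4 r(0x11), g(0x22), b(0x33), a(0x44);
//     vint4 texels = interleave_rgba8(r, g, b, a);   // each lane 0x44332211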

/**
 * @brief Store a vector, skipping lanes where the mask is not set.
 *
 * All inactive (skipped) lanes must be at the end of the vector, after all
 * active lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
{
	if (mask.lane<3>())
	{
		store(data, base);
	}
	else if (mask.lane<2>())
	{
		base[0] = data.lane<0>();
		base[1] = data.lane<1>();
		base[2] = data.lane<2>();
	}
	else if (mask.lane<1>())
	{
		base[0] = data.lane<0>();
		base[1] = data.lane<1>();
	}
	else if (mask.lane<0>())
	{
		base[0] = data.lane<0>();
	}
}
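
// Illustrative sketch (not part of the API): writing only the first two lanes
// of a vector, e.g. when handling a partial block tail.
//
//     int out[4] { 0, 0, 0, 0 };
//     vmask4 keep(true, true, false, false);
//     store_lanes_masked(out, vint4(1, 2, 3, 4), keep);   // out = { 1, 2, 0, 0 }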

#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
	return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
}

#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED