1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2019-2022 Arm Limited
4 // Copyright 2008 Jose Fonseca
5 //
6 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
7 // use this file except in compliance with the License. You may obtain a copy
8 // of the License at:
9 //
10 //     http://www.apache.org/licenses/LICENSE-2.0
11 //
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15 // License for the specific language governing permissions and limitations
16 // under the License.
17 // ----------------------------------------------------------------------------
18 
19 /*
20  * This module implements vector support for floats, ints, and vector lane
21  * control masks. It provides access to both explicit vector width types, and
22  * flexible N-wide types where N can be determined at compile time.
23  *
24  * The design of this module encourages use of vector length agnostic code, via
25  * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
26  * with that is available at compile time. The current vector width is
27  * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
28  *
29  * Explicit scalar types are acessible via the vint1, vfloat1, vmask1 types.
30  * These are provided primarily for prototyping and algorithm debug of VLA
31  * implementations.
32  *
33  * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
34  * types. These are provided for use by VLA code, but are also expected to be
35  * used as a fixed-width type and will supported a reference C++ fallback for
36  * use on platforms without SIMD intrinsics.
37  *
38  * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
39  * types. These are provide for use by VLA code, and are not expected to be
40  * used as a fixed-width type in normal code. No reference C implementation is
41  * provided on platforms without underlying SIMD intrinsics.
42  *
43  * With the current implementation ISA support is provided for:
44  *
45  *     * 1-wide for scalar reference.
46  *     * 4-wide for Armv8-A NEON.
47  *     * 4-wide for x86-64 SSE2.
48  *     * 4-wide for x86-64 SSE4.1.
49  *     * 8-wide for x86-64 AVX2.
50  */
51 
52 #ifndef ASTC_VECMATHLIB_H_INCLUDED
53 #define ASTC_VECMATHLIB_H_INCLUDED
54 
55 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
56 	#include <immintrin.h>
57 #elif ASTCENC_NEON != 0
58 	#include <arm_neon.h>
59 #endif
60 
61 #if !defined(__clang__) && defined(_MSC_VER)
62 	#define ASTCENC_SIMD_INLINE __forceinline
63 	#define ASTCENC_NO_INLINE
64 #elif defined(__GNUC__) && !defined(__clang__)
65 	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
66 	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
67 #else
68 	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
69 	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
70 #endif
71 
72 #if ASTCENC_AVX >= 2
73 	/* If we have AVX2 expose 8-wide VLA. */
74 	#include "astcenc_vecmathlib_sse_4.h"
75 	#include "astcenc_vecmathlib_common_4.h"
76 	#include "astcenc_vecmathlib_avx2_8.h"
77 
78 	#define ASTCENC_SIMD_WIDTH 8
79 
80 	using vfloat = vfloat8;
81 
82 	#if defined(ASTCENC_NO_INVARIANCE)
83 		using vfloatacc = vfloat8;
84 	#else
85 		using vfloatacc = vfloat4;
86 	#endif
87 
88 	using vint = vint8;
89 	using vmask = vmask8;
90 
91 	constexpr auto loada = vfloat8::loada;
92 	constexpr auto load1 = vfloat8::load1;
93 
94 #elif ASTCENC_SSE >= 20
95 	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
96 	#include "astcenc_vecmathlib_sse_4.h"
97 	#include "astcenc_vecmathlib_common_4.h"
98 
99 	#define ASTCENC_SIMD_WIDTH 4
100 
101 	using vfloat = vfloat4;
102 	using vfloatacc = vfloat4;
103 	using vint = vint4;
104 	using vmask = vmask4;
105 
106 	constexpr auto loada = vfloat4::loada;
107 	constexpr auto load1 = vfloat4::load1;
108 
109 #elif ASTCENC_NEON > 0
110 	/* If we have NEON expose 4-wide VLA. */
111 	#include "astcenc_vecmathlib_neon_4.h"
112 	#include "astcenc_vecmathlib_common_4.h"
113 
114 	#define ASTCENC_SIMD_WIDTH 4
115 
116 	using vfloat = vfloat4;
117 	using vfloatacc = vfloat4;
118 	using vint = vint4;
119 	using vmask = vmask4;
120 
121 	constexpr auto loada = vfloat4::loada;
122 	constexpr auto load1 = vfloat4::load1;
123 
124 #else
125 	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
126 
127 	// Note: We no longer expose the 1-wide scalar fallback because it is not
128 	// invariant with the 4-wide path due to algorithms that use horizontal
129 	// operations that accumulate a local vector sum before accumulating into
130 	// a running sum.
131 	//
132 	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
133 	//
134 	//     result = ((((sum + l0) + l1) + l2) + l3)
135 	//
136     // ... whereas the accumulator for a 4-wide vector sum is:
137 	//
138 	//     result = sum + ((l0 + l2) + (l1 + l3))
139 	//
140 	// In "normal maths" this is the same, but the floating point reassociation
141 	// differences mean that these will not produce the same result.
142 
143 	#include "astcenc_vecmathlib_none_4.h"
144 	#include "astcenc_vecmathlib_common_4.h"
145 
146 	#define ASTCENC_SIMD_WIDTH 4
147 
148 	using vfloat = vfloat4;
149 	using vfloatacc = vfloat4;
150 	using vint = vint4;
151 	using vmask = vmask4;
152 
153 	constexpr auto loada = vfloat4::loada;
154 	constexpr auto load1 = vfloat4::load1;
155 #endif
156 
157 /**
158  * @brief Round a count down to the largest multiple of 8.
159  *
160  * @param count   The unrounded value.
161  *
162  * @return The rounded value.
163  */
round_down_to_simd_multiple_8(unsigned int count)164 ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
165 {
166 	return count & static_cast<unsigned int>(~(8 - 1));
167 }
168 
169 /**
170  * @brief Round a count down to the largest multiple of 4.
171  *
172  * @param count   The unrounded value.
173  *
174  * @return The rounded value.
175  */
round_down_to_simd_multiple_4(unsigned int count)176 ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
177 {
178 	return count & static_cast<unsigned int>(~(4 - 1));
179 }
180 
181 /**
182  * @brief Round a count down to the largest multiple of the SIMD width.
183  *
184  * Assumption that the vector width is a power of two ...
185  *
186  * @param count   The unrounded value.
187  *
188  * @return The rounded value.
189  */
round_down_to_simd_multiple_vla(unsigned int count)190 ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
191 {
192 	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
193 }
194 
195 /**
196  * @brief Round a count up to the largest multiple of the SIMD width.
197  *
198  * Assumption that the vector width is a power of two ...
199  *
200  * @param count   The unrounded value.
201  *
202  * @return The rounded value.
203  */
round_up_to_simd_multiple_vla(unsigned int count)204 ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
205 {
206 	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
207 	return multiples * ASTCENC_SIMD_WIDTH;
208 }
209 
210 /**
211  * @brief Return @c a with lanes negated if the @c b lane is negative.
212  */
change_sign(vfloat a,vfloat b)213 ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
214 {
215 	vint ia = float_as_int(a);
216 	vint ib = float_as_int(b);
217 	vint sign_mask(static_cast<int>(0x80000000));
218 	vint r = ia ^ (ib & sign_mask);
219 	return int_as_float(r);
220 }
221 
222 /**
223  * @brief Return fast, but approximate, vector atan(x).
224  *
225  * Max error of this implementation is 0.004883.
226  */
atan(vfloat x)227 ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
228 {
229 	vmask c = abs(x) > vfloat(1.0f);
230 	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
231 	vfloat y = select(x, vfloat(1.0f) / x, c);
232 	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
233 	return select(y, z - y, c);
234 }
235 
236 /**
237  * @brief Return fast, but approximate, vector atan2(x, y).
238  */
atan2(vfloat y,vfloat x)239 ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
240 {
241 	vfloat z = atan(abs(y / x));
242 	vmask xmask = vmask(float_as_int(x).m);
243 	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
244 }
245 
246 /*
247  * @brief Factory that returns a unit length 4 component vfloat4.
248  */
unit4()249 static ASTCENC_SIMD_INLINE vfloat4 unit4()
250 {
251 	return vfloat4(0.5f);
252 }
253 
254 /**
255  * @brief Factory that returns a unit length 3 component vfloat4.
256  */
unit3()257 static ASTCENC_SIMD_INLINE vfloat4 unit3()
258 {
259 	float val = 0.577350258827209473f;
260 	return vfloat4(val, val, val, 0.0f);
261 }
262 
263 /**
264  * @brief Factory that returns a unit length 2 component vfloat4.
265  */
unit2()266 static ASTCENC_SIMD_INLINE vfloat4 unit2()
267 {
268 	float val = 0.707106769084930420f;
269 	return vfloat4(val, val, 0.0f, 0.0f);
270 }
271 
272 /**
273  * @brief Factory that returns a 3 component vfloat4.
274  */
vfloat3(float a,float b,float c)275 static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
276 {
277 	return vfloat4(a, b, c, 0.0f);
278 }
279 
280 /**
281  * @brief Factory that returns a 2 component vfloat4.
282  */
vfloat2(float a,float b)283 static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
284 {
285 	return vfloat4(a, b, 0.0f, 0.0f);
286 }
287 
288 /**
289  * @brief Normalize a non-zero length vector to unit length.
290  */
normalize(vfloat4 a)291 static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
292 {
293 	vfloat4 length = dot(a, a);
294 	return a / sqrt(length);
295 }
296 
297 /**
298  * @brief Normalize a vector, returning @c safe if len is zero.
299  */
normalize_safe(vfloat4 a,vfloat4 safe)300 static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
301 {
302 	vfloat4 length = dot(a, a);
303 	if (length.lane<0>() != 0.0f)
304 	{
305 		return a / sqrt(length);
306 	}
307 
308 	return safe;
309 }
310 
311 
312 
313 #define POLY0(x, c0)                     (                                     c0)
314 #define POLY1(x, c0, c1)                 ((POLY0(x, c1) * x)                 + c0)
315 #define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * x)             + c0)
316 #define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * x)         + c0)
317 #define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * x)     + c0)
318 #define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
319 
320 /**
321  * @brief Compute an approximate exp2(x) for each lane in the vector.
322  *
323  * Based on 5th degree minimax polynomials, ported from this blog
324  * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
325  */
exp2(vfloat4 x)326 static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
327 {
328 	x = clamp(-126.99999f, 129.0f, x);
329 
330 	vint4 ipart = float_to_int(x - 0.5f);
331 	vfloat4 fpart = x - int_to_float(ipart);
332 
333 	// Integer contrib, using 1 << ipart
334 	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
335 
336 	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
337 	vfloat4 fexp = POLY5(fpart,
338 	                     9.9999994e-1f,
339 	                     6.9315308e-1f,
340 	                     2.4015361e-1f,
341 	                     5.5826318e-2f,
342 	                     8.9893397e-3f,
343 	                     1.8775767e-3f);
344 
345 	return iexp * fexp;
346 }
347 
348 /**
349  * @brief Compute an approximate log2(x) for each lane in the vector.
350  *
351  * Based on 5th degree minimax polynomials, ported from this blog
352  * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
353  */
log2(vfloat4 x)354 static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
355 {
356 	vint4 exp(0x7F800000);
357 	vint4 mant(0x007FFFFF);
358 	vint4 one(0x3F800000);
359 
360 	vint4 i = float_as_int(x);
361 
362 	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
363 
364 	vfloat4 m = int_as_float((i & mant) | one);
365 
366 	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
367 	vfloat4 p = POLY4(m,
368 	                  2.8882704548164776201f,
369 	                 -2.52074962577807006663f,
370 	                  1.48116647521213171641f,
371 	                 -0.465725644288844778798f,
372 	                  0.0596515482674574969533f);
373 
374 	// Increases the polynomial degree, but ensures that log2(1) == 0
375 	p = p * (m - 1.0f);
376 
377 	return p + e;
378 }
379 
380 /**
381  * @brief Compute an approximate pow(x, y) for each lane in the vector.
382  *
383  * Power function based on the exp2(log2(x) * y) transform.
384  */
pow(vfloat4 x,vfloat4 y)385 static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
386 {
387 	vmask4 zero_mask = y == vfloat4(0.0f);
388 	vfloat4 estimate = exp2(log2(x) * y);
389 
390 	// Guarantee that y == 0 returns exactly 1.0f
391 	return select(estimate, vfloat4(1.0f), zero_mask);
392 }
393 
394 /**
395  * @brief Count the leading zeros for each lane in @c a.
396  *
397  * Valid for all data values of @c a; will return a per-lane value [0, 32].
398  */
clz(vint4 a)399 static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
400 {
401 	// This function is a horrible abuse of floating point exponents to convert
402 	// the original integer value into a 2^N encoding we can recover easily.
403 
404 	// Convert to float without risk of rounding up by keeping only top 8 bits.
405 	// This trick is is guranteed to keep top 8 bits and clear the 9th.
406 	a = (~lsr<8>(a)) & a;
407 	a = float_as_int(int_to_float(a));
408 
409 	// Extract and unbias exponent
410 	a = vint4(127 + 31) - lsr<23>(a);
411 
412 	// Clamp result to a valid 32-bit range
413 	return clamp(0, 32, a);
414 }
415 
416 /**
417  * @brief Return lanewise 2^a for each lane in @c a.
418  *
419  * Use of signed int means that this is only valid for values in range [0, 31].
420  */
two_to_the_n(vint4 a)421 static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
422 {
423 	// 2^30 is the largest signed number than can be represented
424 	assert(all(a < vint4(31)));
425 
426 	// This function is a horrible abuse of floating point to use the exponent
427 	// and float conversion to generate a 2^N multiple.
428 
429 	// Bias the exponent
430 	vint4 exp = a + 127;
431 	exp = lsl<23>(exp);
432 
433 	// Reinterpret the bits as a float, and then convert to an int
434 	vfloat4 f = int_as_float(exp);
435 	return float_to_int(f);
436 }
437 
438 /**
439  * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
440  */
unorm16_to_sf16(vint4 p)441 static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
442 {
443 	vint4 fp16_one = vint4(0x3C00);
444 	vint4 fp16_small = lsl<8>(p);
445 
446 	vmask4 is_one = p == vint4(0xFFFF);
447 	vmask4 is_small = p < vint4(4);
448 
449 	// Manually inline clz() on Visual Studio to avoid release build codegen bug
450 	// see https://github.com/ARM-software/astc-encoder/issues/259
451 #if !defined(__clang__) && defined(_MSC_VER)
452 	vint4 a = (~lsr<8>(p)) & p;
453 	a = float_as_int(int_to_float(a));
454 	a = vint4(127 + 31) - lsr<23>(a);
455 	vint4 lz = clamp(0, 32, a) - 16;
456 #else
457 	vint4 lz = clz(p) - 16;
458 #endif
459 
460 	p = p * two_to_the_n(lz + 1);
461 	p = p & vint4(0xFFFF);
462 
463 	p = lsr<6>(p);
464 
465 	p = p | lsl<10>(vint4(14) - lz);
466 
467 	vint4 r = select(p, fp16_one, is_one);
468 	r = select(r, fp16_small, is_small);
469 	return r;
470 }
471 
472 /**
473  * @brief Convert 16-bit LNS to float16.
474  */
lns_to_sf16(vint4 p)475 static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
476 {
477 	vint4 mc = p & 0x7FF;
478 	vint4 ec = lsr<11>(p);
479 
480 	vint4 mc_512 = mc * 3;
481 	vmask4 mask_512 = mc < vint4(512);
482 
483 	vint4 mc_1536 = mc * 4 - 512;
484 	vmask4 mask_1536 = mc < vint4(1536);
485 
486 	vint4 mc_else = mc * 5 - 2048;
487 
488 	vint4 mt = mc_else;
489 	mt = select(mt, mc_1536, mask_1536);
490 	mt = select(mt, mc_512, mask_512);
491 
492 	vint4 res = lsl<10>(ec) | lsr<3>(mt);
493 	return min(res, vint4(0x7BFF));
494 }
495 
496 /**
497  * @brief Extract mantissa and exponent of a float value.
498  *
499  * @param      a      The input value.
500  * @param[out] exp    The output exponent.
501  *
502  * @return The mantissa.
503  */
frexp(vfloat4 a,vint4 & exp)504 static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
505 {
506 	// Interpret the bits as an integer
507 	vint4 ai = float_as_int(a);
508 
509 	// Extract and unbias the exponent
510 	exp = (lsr<23>(ai) & 0xFF) - 126;
511 
512 	// Extract and unbias the mantissa
513 	vint4 manti = (ai &  static_cast<int>(0x807FFFFF)) | 0x3F000000;
514 	return int_as_float(manti);
515 }
516 
517 /**
518  * @brief Convert float to 16-bit LNS.
519  */
float_to_lns(vfloat4 a)520 static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
521 {
522 	vint4 exp;
523 	vfloat4 mant = frexp(a, exp);
524 
525 	// Do these early before we start messing about ...
526 	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
527 	vmask4 mask_infinity = a >= vfloat4(65536.0f);
528 
529 	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
530 	vmask4 exp_lt_m13 = exp < vint4(-13);
531 
532 	vfloat4 a1a = a * 33554432.0f;
533 	vint4 expa = vint4::zero();
534 
535 	vfloat4 a1b = (mant - 0.5f) * 4096;
536 	vint4 expb = exp + 14;
537 
538 	a = select(a1b, a1a, exp_lt_m13);
539 	exp = select(expb, expa, exp_lt_m13);
540 
541 	vmask4 a_lt_384 = a < vfloat4(384.0f);
542 	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
543 
544 	vfloat4 a2a = a * (4.0f / 3.0f);
545 	vfloat4 a2b = a + 128.0f;
546 	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
547 
548 	a = a2c;
549 	a = select(a, a2b, a_lt_1408);
550 	a = select(a, a2a, a_lt_384);
551 
552 	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
553 
554 	a = select(a, vfloat4(65535.0f), mask_infinity);
555 	a = select(a, vfloat4::zero(), mask_underflow_nan);
556 
557 	return a;
558 }
559 
560 namespace astc
561 {
562 
pow(float x,float y)563 static ASTCENC_SIMD_INLINE float pow(float x, float y)
564 {
565 	return pow(vfloat4(x), vfloat4(y)).lane<0>();
566 }
567 
568 }
569 
570 #endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
571