1 //_____________________________________________________________/\_______________________________________________________________
2 //==============================================================================================================================
3 //
4 //                                              [A] SHADER PORTABILITY 1.20190530
5 //
6 //==============================================================================================================================
7 // LICENSE
8 // =======
9 // Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
10 // Copyright (c) <2014> <Michal Drobot>
11 // -------
12 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
13 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
14 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
15 // Software is furnished to do so, subject to the following conditions:
16 // -------
17 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
18 // Software.
19 // -------
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
21 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
22 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
23 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 //------------------------------------------------------------------------------------------------------------------------------
25 // ABOUT
26 // =====
27 // Common central point for high-level shading language and C portability for various shader headers.
28 //------------------------------------------------------------------------------------------------------------------------------
29 // DEFINES
30 // =======
31 // A_CPU ..... Include the CPU related code.
32 // A_GPU ..... Include the GPU related code.
33 // A_GLSL .... Using GLSL.
34 // A_HLSL .... Using HLSL.
35 // A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
36 // =======
37 // A_BYTE .... Support 8-bit integer.
38 // A_HALF .... Support 16-bit integer and floating point.
39 // A_LONG .... Support 64-bit integer.
40 // A_DUBL .... Support 64-bit floating point.
41 // =======
42 // A_WAVE .... Support wave-wide operations.
43 //------------------------------------------------------------------------------------------------------------------------------
44 // To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
45 //------------------------------------------------------------------------------------------------------------------------------
46 // SIMPLIFIED TYPE SYSTEM
47 // ======================
48 //  - All ints will be unsigned with exception of when signed is required.
49 //  - Type naming simplified and shortened "A<type><#components>",
50 //     - H = 16-bit float (half)
51 //     - F = 32-bit float (float)
52 //     - D = 64-bit float (double)
53 //     - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
54 //     - B = 8-bit integer (byte)
55 //     - W = 16-bit integer (word)
56 //     - U = 32-bit integer (unsigned)
57 //     - L = 64-bit integer (long)
58 //  - Using "AS<type><#components>" for signed when required.
59 //------------------------------------------------------------------------------------------------------------------------------
60 // TODO
61 // ====
62 //  - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
63 //  - Add subgroup ops.
64 //------------------------------------------------------------------------------------------------------------------------------
65 // CHANGE LOG
66 // ==========
// 20190531 - Switched to llabs() because long is only 32 bits wide (same as int) on Windows.
68 // 20190530 - Updated for new CPU/GPU portability.
69 // 20190528 - Fix AU1_AH2_x() on HLSL (had incorrectly swapped x and y), fixed asuint() cases.
70 // 20190527 - Added min3/max3 for low precision for HLSL.
71 // 20190526 - Updated with half approximations, added ARsq*(), and ASat*() for CPU.
72 // 20190519 - Added more approximations.
73 // 20190514 - Added long conversions.
74 // 20190513 - Added the real BFI moved the other one to ABfiM().
75 // 20190507 - Added extra remap useful for 2D reductions.
76 // 20190507 - Started adding wave ops, add parabolic sin/cos.
77 // 20190505 - Added ASigned*() and friends, setup more auto-typecast, GLSL extensions, etc.
78 // 20190504 - Added min3/max3 for 32-bit integers.
79 // 20190503 - Added type reinterpretation for half.
80 // 20190416 - Added min3/max3 for half.
81 // 20190405 - Misc bug fixing.
82 // 20190404 - Cleaned up color conversion code. Switched "splat" to shorter naming "type_". Misc bug fixing.
83 //==============================================================================================================================
84 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
85 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
86 //_____________________________________________________________/\_______________________________________________________________
87 //==============================================================================================================================
88 //                                                           COMMON
89 //==============================================================================================================================
// Two pi, written with enough digits to round to the nearest 64-bit float value of 2*pi.
// The extra digits do not change the 32-bit result: it rounds to the same float as the shorter 6.28318530718 literal.
#define A_2PI 6.283185307179586
91 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
92 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
93 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
94 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
95 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
96 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
97 //_____________________________________________________________/\_______________________________________________________________
98 //==============================================================================================================================
99 //
100 //
101 //                                                             CPU
102 //
103 //
104 //==============================================================================================================================
105 // Requires standard C types: stdint.h
106 // Requires a collection of standard math intrinsics.
107 //  - Requires VS2013 when not using GCC to get exp2() and log2().
108 //  - https://blogs.msdn.microsoft.com/vcblog/2013/07/19/c99-library-support-in-visual-studio-2013/
109 //------------------------------------------------------------------------------------------------------------------------------
110 // This provides a minimum subset of functionality compared to the GPU parts.
111 //==============================================================================================================================
112 #ifdef A_CPU
// Pointer qualifier used by the vector argument macros. Define A_RESTRICT before this point to override the default.
#if !defined(A_RESTRICT)
 #define A_RESTRICT __restrict
#endif
117 //------------------------------------------------------------------------------------------------------------------------------
// Storage class placed in front of the CPU helper functions. Define A_STATIC before this point to override the default.
#if !defined(A_STATIC)
 #define A_STATIC static
#endif
121 //------------------------------------------------------------------------------------------------------------------------------
// Scalar type names shared by the CPU and GPU code paths (fixed-width integers come from stdint.h).
// Floating point: F = 32-bit, D = 64-bit.
typedef float    AF1;
typedef double   AD1;
// Unsigned integers: B = 8-bit, W = 16-bit, U = 32-bit, L = 64-bit.
typedef uint8_t  AB1;
typedef uint16_t AW1;
typedef uint32_t AU1;
typedef uint64_t AL1;
// Signed counterparts, used only where signed behavior is required.
typedef int8_t   ASB1;
typedef int16_t  ASW1;
typedef int32_t  ASU1;
typedef int64_t  ASL1;
// Predicate is stored in a 32-bit unsigned integer (C friendly bool).
typedef uint32_t AP1;
135 //------------------------------------------------------------------------------------------------------------------------------
// Value conversions to the unsigned/float scalar types, spelled as "<type>_(value)" (plain C casts on the CPU).
#define AD1_(x) ((AD1)(x))
#define AF1_(x) ((AF1)(x))
#define AL1_(x) ((AL1)(x))
#define AU1_(x) ((AU1)(x))
//------------------------------------------------------------------------------------------------------------------------------
// Value conversions to the signed integer types.
#define ASL1_(x) ((ASL1)(x))
#define ASU1_(x) ((ASU1)(x))
143 //------------------------------------------------------------------------------------------------------------------------------
AU1_AF1(AF1 a)144  A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
145 //------------------------------------------------------------------------------------------------------------------------------
// Boolean constants: true is 1, false is 0.
#define A_TRUE  1
#define A_FALSE 0
148 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
149 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
150 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
151 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
152 //_____________________________________________________________/\_______________________________________________________________
153 //==============================================================================================================================
154 //
155 //                                                       CPU/GPU PORTING
156 //
157 //------------------------------------------------------------------------------------------------------------------------------
// Hackery to get CPU and GPU to share all setup code, without duplicate code paths.
159 // Unfortunately this is the level of "ugly" that is required since the languages are very different.
160 // This uses a lower-case prefix for special vector constructs.
161 //  - In C restrict pointers are used.
162 //  - In the shading language, in/inout/out arguments are used.
163 // This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
164 //==============================================================================================================================
165 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
166 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
167 //_____________________________________________________________/\_______________________________________________________________
168 //==============================================================================================================================
169 //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
170 //==============================================================================================================================
// Vector return types. On the CPU every vector size maps to an A_RESTRICT-qualified pointer to the scalar element type.
// 64-bit float vectors.
#define retAD2  AD1 * A_RESTRICT
#define retAD3  AD1 * A_RESTRICT
#define retAD4  AD1 * A_RESTRICT
// 32-bit float vectors.
#define retAF2  AF1 * A_RESTRICT
#define retAF3  AF1 * A_RESTRICT
#define retAF4  AF1 * A_RESTRICT
// 64-bit integer vectors.
#define retAL2  AL1 * A_RESTRICT
#define retAL3  AL1 * A_RESTRICT
#define retAL4  AL1 * A_RESTRICT
// 32-bit integer vectors.
#define retAU2  AU1 * A_RESTRICT
#define retAU3  AU1 * A_RESTRICT
#define retAU4  AU1 * A_RESTRICT
183 //------------------------------------------------------------------------------------------------------------------------------
// Vector input argument types. On the CPU these are A_RESTRICT-qualified pointers to the scalar element type.
// Note the pointers are not const-qualified.
// 64-bit float vectors.
#define inAD2  AD1 * A_RESTRICT
#define inAD3  AD1 * A_RESTRICT
#define inAD4  AD1 * A_RESTRICT
// 32-bit float vectors.
#define inAF2  AF1 * A_RESTRICT
#define inAF3  AF1 * A_RESTRICT
#define inAF4  AF1 * A_RESTRICT
// 64-bit integer vectors.
#define inAL2  AL1 * A_RESTRICT
#define inAL3  AL1 * A_RESTRICT
#define inAL4  AL1 * A_RESTRICT
// 32-bit integer vectors.
#define inAU2  AU1 * A_RESTRICT
#define inAU3  AU1 * A_RESTRICT
#define inAU4  AU1 * A_RESTRICT
196 //------------------------------------------------------------------------------------------------------------------------------
// Vector read/write argument types. On the CPU these are A_RESTRICT-qualified pointers to the scalar element type.
// 64-bit float vectors.
#define inoutAD2  AD1 * A_RESTRICT
#define inoutAD3  AD1 * A_RESTRICT
#define inoutAD4  AD1 * A_RESTRICT
// 32-bit float vectors.
#define inoutAF2  AF1 * A_RESTRICT
#define inoutAF3  AF1 * A_RESTRICT
#define inoutAF4  AF1 * A_RESTRICT
// 64-bit integer vectors.
#define inoutAL2  AL1 * A_RESTRICT
#define inoutAL3  AL1 * A_RESTRICT
#define inoutAL4  AL1 * A_RESTRICT
// 32-bit integer vectors.
#define inoutAU2  AU1 * A_RESTRICT
#define inoutAU3  AU1 * A_RESTRICT
#define inoutAU4  AU1 * A_RESTRICT
209 //------------------------------------------------------------------------------------------------------------------------------
// Vector output argument types. On the CPU these are A_RESTRICT-qualified pointers to the scalar element type.
// 64-bit float vectors.
#define outAD2  AD1 * A_RESTRICT
#define outAD3  AD1 * A_RESTRICT
#define outAD4  AD1 * A_RESTRICT
// 32-bit float vectors.
#define outAF2  AF1 * A_RESTRICT
#define outAF3  AF1 * A_RESTRICT
#define outAF4  AF1 * A_RESTRICT
// 64-bit integer vectors.
#define outAL2  AL1 * A_RESTRICT
#define outAL3  AL1 * A_RESTRICT
#define outAL4  AL1 * A_RESTRICT
// 32-bit integer vectors.
#define outAU2  AU1 * A_RESTRICT
#define outAU3  AU1 * A_RESTRICT
#define outAU4  AU1 * A_RESTRICT
222 //------------------------------------------------------------------------------------------------------------------------------
// Vector variable declarators. On the CPU a vector variable is a fixed-size array of the scalar element type.
// Usage: 'varAF3(color);' declares 'AF1 color[3];'.
// 64-bit float vectors.
#define varAD2(name) AD1 name[2]
#define varAD3(name) AD1 name[3]
#define varAD4(name) AD1 name[4]
// 32-bit float vectors.
#define varAF2(name) AF1 name[2]
#define varAF3(name) AF1 name[3]
#define varAF4(name) AF1 name[4]
// 64-bit integer vectors.
#define varAL2(name) AL1 name[2]
#define varAL3(name) AL1 name[3]
#define varAL4(name) AL1 name[4]
// 32-bit integer vectors.
#define varAU2(name) AU1 name[2]
#define varAU3(name) AU1 name[3]
#define varAU4(name) AU1 name[4]
235 //------------------------------------------------------------------------------------------------------------------------------
// Vector initializers. On the CPU these expand to a brace-enclosed initializer list, one entry per component.
// 64-bit float vectors.
#define initAD2(c0,c1) {c0,c1}
#define initAD3(c0,c1,c2) {c0,c1,c2}
#define initAD4(c0,c1,c2,c3) {c0,c1,c2,c3}
// 32-bit float vectors.
#define initAF2(c0,c1) {c0,c1}
#define initAF3(c0,c1,c2) {c0,c1,c2}
#define initAF4(c0,c1,c2,c3) {c0,c1,c2,c3}
// 64-bit integer vectors.
#define initAL2(c0,c1) {c0,c1}
#define initAL3(c0,c1,c2) {c0,c1,c2}
#define initAL4(c0,c1,c2,c3) {c0,c1,c2,c3}
// 32-bit integer vectors.
#define initAU2(c0,c1) {c0,c1}
#define initAU3(c0,c1,c2) {c0,c1,c2}
#define initAU4(c0,c1,c2,c3) {c0,c1,c2,c3}
248 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
249 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
250 //_____________________________________________________________/\_______________________________________________________________
251 //==============================================================================================================================
252 //                                                     SCALAR RETURN OPS
253 //------------------------------------------------------------------------------------------------------------------------------
254 // TODO
255 // ====
256 //  - Replace transcendentals with manual versions.
257 //==============================================================================================================================
258  #ifdef A_GCC
AAbsD1(AD1 a)259   A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
AAbsF1(AF1 a)260   A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
AAbsSU1(AU1 a)261   A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
AAbsSL1(AL1 a)262   A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_labs(ASL1_(a)));}
263  #else
AAbsD1(AD1 a)264   A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
AAbsF1(AF1 a)265   A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
AAbsSU1(AU1 a)266   A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
AAbsSL1(AL1 a)267   A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs(ASL1_(a)));}
268  #endif
269 //------------------------------------------------------------------------------------------------------------------------------
270  #ifdef A_GCC
ACosD1(AD1 a)271   A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
ACosF1(AF1 a)272   A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
273  #else
ACosD1(AD1 a)274   A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
ACosF1(AF1 a)275   A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
276  #endif
277 //------------------------------------------------------------------------------------------------------------------------------
ADotD2(inAD2 a,inAD2 b)278  A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];}
ADotD3(inAD3 a,inAD3 b)279  A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
ADotD4(inAD4 a,inAD4 b)280  A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
ADotF2(inAF2 a,inAF2 b)281  A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
ADotF3(inAF3 a,inAF3 b)282  A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
ADotF4(inAF4 a,inAF4 b)283  A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
284 //------------------------------------------------------------------------------------------------------------------------------
285  #ifdef A_GCC
AExp2D1(AD1 a)286   A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
AExp2F1(AF1 a)287   A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
288  #else
AExp2D1(AD1 a)289   A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
AExp2F1(AF1 a)290   A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
291  #endif
292 //------------------------------------------------------------------------------------------------------------------------------
293  #ifdef A_GCC
AFloorD1(AD1 a)294   A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
AFloorF1(AF1 a)295   A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
296  #else
AFloorD1(AD1 a)297   A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
AFloorF1(AF1 a)298   A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
299  #endif
300 //------------------------------------------------------------------------------------------------------------------------------
ALerpD1(AD1 a,AD1 b,AD1 c)301  A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);}
ALerpF1(AF1 a,AF1 b,AF1 c)302  A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
303 //------------------------------------------------------------------------------------------------------------------------------
304  #ifdef A_GCC
ALog2D1(AD1 a)305   A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
ALog2F1(AF1 a)306   A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
307  #else
ALog2D1(AD1 a)308   A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
ALog2F1(AF1 a)309   A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
310  #endif
311 //------------------------------------------------------------------------------------------------------------------------------
AMaxD1(AD1 a,AD1 b)312  A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
AMaxF1(AF1 a,AF1 b)313  A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
AMaxL1(AL1 a,AL1 b)314  A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
AMaxU1(AU1 a,AU1 b)315  A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
316 //------------------------------------------------------------------------------------------------------------------------------
317  // These follow the convention that A integer types don't have signage, until they are operated on.
AMaxSL1(AL1 a,AL1 b)318  A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
AMaxSU1(AU1 a,AU1 b)319  A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
320 //------------------------------------------------------------------------------------------------------------------------------
AMinD1(AD1 a,AD1 b)321  A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
AMinF1(AF1 a,AF1 b)322  A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
AMinL1(AL1 a,AL1 b)323  A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
AMinU1(AU1 a,AU1 b)324  A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
325 //------------------------------------------------------------------------------------------------------------------------------
AMinSL1(AL1 a,AL1 b)326  A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
AMinSU1(AU1 a,AU1 b)327  A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
328 //------------------------------------------------------------------------------------------------------------------------------
ARcpD1(AD1 a)329  A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
ARcpF1(AF1 a)330  A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
331 //------------------------------------------------------------------------------------------------------------------------------
AShrSL1(AL1 a,AL1 b)332  A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
AShrSU1(AU1 a,AU1 b)333  A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
334 //------------------------------------------------------------------------------------------------------------------------------
335  #ifdef A_GCC
ASinD1(AD1 a)336   A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
ASinF1(AF1 a)337   A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
338  #else
ASinD1(AD1 a)339   A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
ASinF1(AF1 a)340   A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
341  #endif
342 //------------------------------------------------------------------------------------------------------------------------------
343  #ifdef A_GCC
ASqrtD1(AD1 a)344   A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
ASqrtF1(AF1 a)345   A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
346  #else
ASqrtD1(AD1 a)347   A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
ASqrtF1(AF1 a)348   A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
349  #endif
350 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
351 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
352 //_____________________________________________________________/\_______________________________________________________________
353 //==============================================================================================================================
354 //                                               SCALAR RETURN OPS - DEPENDENT
355 //==============================================================================================================================
AFractD1(AD1 a)356  A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
AFractF1(AF1 a)357  A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
358 //------------------------------------------------------------------------------------------------------------------------------
APowD1(AD1 a,AD1 b)359  A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
APowF1(AF1 a,AF1 b)360  A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
361 //------------------------------------------------------------------------------------------------------------------------------
ARsqD1(AD1 a)362  A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
ARsqF1(AF1 a)363  A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
364 //------------------------------------------------------------------------------------------------------------------------------
ASatD1(AD1 a)365  A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));}
ASatF1(AF1 a)366  A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
367 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
368 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
369 //_____________________________________________________________/\_______________________________________________________________
370 //==============================================================================================================================
371 //                                                         VECTOR OPS
372 //------------------------------------------------------------------------------------------------------------------------------
373 // These are added as needed for production or prototyping, so not necessarily a complete set.
374 // They follow a convention of taking in a destination and also returning the destination value to increase utility.
375 //==============================================================================================================================
opAAbsD2(outAD2 d,inAD2 a)376  A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
opAAbsD3(outAD3 d,inAD3 a)377  A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
opAAbsD4(outAD4 d,inAD4 a)378  A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
379 //------------------------------------------------------------------------------------------------------------------------------
opAAbsF2(outAF2 d,inAF2 a)380  A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
opAAbsF3(outAF3 d,inAF3 a)381  A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
opAAbsF4(outAF4 d,inAF4 a)382  A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
383 //==============================================================================================================================
opAAddD2(outAD2 d,inAD2 a,inAD2 b)384  A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
opAAddD3(outAD3 d,inAD3 a,inAD3 b)385  A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
opAAddD4(outAD4 d,inAD4 a,inAD4 b)386  A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
387 //------------------------------------------------------------------------------------------------------------------------------
opAAddF2(outAF2 d,inAF2 a,inAF2 b)388  A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
opAAddF3(outAF3 d,inAF3 a,inAF3 b)389  A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
opAAddF4(outAF4 d,inAF4 a,inAF4 b)390  A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
391 //==============================================================================================================================
opACpyD2(outAD2 d,inAD2 a)392  A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;}
opACpyD3(outAD3 d,inAD3 a)393  A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
opACpyD4(outAD4 d,inAD4 a)394  A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
395 //------------------------------------------------------------------------------------------------------------------------------
opACpyF2(outAF2 d,inAF2 a)396  A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
opACpyF3(outAF3 d,inAF3 a)397  A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
opACpyF4(outAF4 d,inAF4 a)398  A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
399 //==============================================================================================================================
opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c)400  A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;}
opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c)401  A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c)402  A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
403 //------------------------------------------------------------------------------------------------------------------------------
opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c)404  A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c)405  A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c)406  A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
407 //==============================================================================================================================
opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c)408  A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;}
opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c)409  A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c)410  A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
411 //------------------------------------------------------------------------------------------------------------------------------
opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c)412  A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c)413  A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c)414  A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
415 //==============================================================================================================================
opAMaxD2(outAD2 d,inAD2 a,inAD2 b)416  A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;}
opAMaxD3(outAD3 d,inAD3 a,inAD3 b)417  A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;}
opAMaxD4(outAD4 d,inAD4 a,inAD4 b)418  A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;}
419 //------------------------------------------------------------------------------------------------------------------------------
opAMaxF2(outAF2 d,inAF2 a,inAF2 b)420  A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;}
opAMaxF3(outAF3 d,inAF3 a,inAF3 b)421  A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;}
opAMaxF4(outAF4 d,inAF4 a,inAF4 b)422  A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;}
423 //==============================================================================================================================
opAMinD2(outAD2 d,inAD2 a,inAD2 b)424  A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;}
opAMinD3(outAD3 d,inAD3 a,inAD3 b)425  A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
opAMinD4(outAD4 d,inAD4 a,inAD4 b)426  A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
427 //------------------------------------------------------------------------------------------------------------------------------
opAMinF2(outAF2 d,inAF2 a,inAF2 b)428  A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;}
opAMinF3(outAF3 d,inAF3 a,inAF3 b)429  A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
opAMinF4(outAF4 d,inAF4 a,inAF4 b)430  A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
431 //==============================================================================================================================
opAMulD2(outAD2 d,inAD2 a,inAD2 b)432  A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
opAMulD3(outAD3 d,inAD3 a,inAD3 b)433  A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
opAMulD4(outAD4 d,inAD4 a,inAD4 b)434  A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
435 //------------------------------------------------------------------------------------------------------------------------------
opAMulF2(outAF2 d,inAF2 a,inAF2 b)436  A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
opAMulF3(outAF3 d,inAF3 a,inAF3 b)437  A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
opAMulF4(outAF4 d,inAF4 a,inAF4 b)438  A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
439 //==============================================================================================================================
opAMulOneD2(outAD2 d,inAD2 a,AD1 b)440  A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
opAMulOneD3(outAD3 d,inAD3 a,AD1 b)441  A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
opAMulOneD4(outAD4 d,inAD4 a,AD1 b)442  A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
443 //------------------------------------------------------------------------------------------------------------------------------
opAMulOneF2(outAF2 d,inAF2 a,AF1 b)444  A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
opAMulOneF3(outAF3 d,inAF3 a,AF1 b)445  A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
opAMulOneF4(outAF4 d,inAF4 a,AF1 b)446  A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
447 //==============================================================================================================================
opANegD2(outAD2 d,inAD2 a)448  A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;}
opANegD3(outAD3 d,inAD3 a)449  A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
opANegD4(outAD4 d,inAD4 a)450  A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
451 //------------------------------------------------------------------------------------------------------------------------------
opANegF2(outAF2 d,inAF2 a)452  A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
opANegF3(outAF3 d,inAF3 a)453  A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
opANegF4(outAF4 d,inAF4 a)454  A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
455 //==============================================================================================================================
opARcpD2(outAD2 d,inAD2 a)456  A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;}
opARcpD3(outAD3 d,inAD3 a)457  A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
opARcpD4(outAD4 d,inAD4 a)458  A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
459 //------------------------------------------------------------------------------------------------------------------------------
opARcpF2(outAF2 d,inAF2 a)460  A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
opARcpF3(outAF3 d,inAF3 a)461  A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
opARcpF4(outAF4 d,inAF4 a)462  A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
463 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
464 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
465 //_____________________________________________________________/\_______________________________________________________________
466 //==============================================================================================================================
467 //                                                     HALF FLOAT PACKING
468 //==============================================================================================================================
469  // Convert float to half (in lower 16-bits of output).
470  // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
471  // Supports denormals.
472  // Conversion rules are to make computations possibly "safer" on the GPU,
473  //  -INF & -NaN -> -65504
474  //  +INF & +NaN -> +65504
AU1_AH1_AF1(AF1 f)475  A_STATIC AU1 AU1_AH1_AF1(AF1 f){
476   static AW1 base[512]={
477    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
478    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
479    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
480    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
481    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
482    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
483    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
484    0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
485    0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
486    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
487    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
488    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
489    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
490    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
491    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
492    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
493    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
494    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
495    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
496    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
497    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
498    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
499    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
500    0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
501    0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
502    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
503    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
504    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
505    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
506    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
507    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
508    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
509   static AB1 shift[512]={
510    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
511    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
512    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
513    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
514    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
515    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
516    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
517    0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
518    0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
519    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
520    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
521    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
522    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
523    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
524    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
525    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
526    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
527    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
528    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
529    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
530    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
531    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
532    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
533    0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
534    0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
535    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
536    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
537    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
538    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
539    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
540    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
541    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
542   union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);}
543 //------------------------------------------------------------------------------------------------------------------------------
544  // Used to output packed constant.
AU1_AH2_AF2(inAF2 a)545  A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);}
546 #endif
547 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
548 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
549 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
550 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
551 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
552 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
553 //_____________________________________________________________/\_______________________________________________________________
554 //==============================================================================================================================
555 //
556 //
557 //                                                            GLSL
558 //
559 //
560 //==============================================================================================================================
561 #if defined(A_GLSL) && defined(A_GPU)
#ifndef A_SKIP_EXT
 // Extension requirements for the optional feature sets. Define A_SKIP_EXT to suppress all of these directives.
 #ifdef A_HALF
  #extension GL_EXT_shader_16bit_storage:require
  #extension GL_EXT_shader_explicit_arithmetic_types:require
 #endif
//------------------------------------------------------------------------------------------------------------------------------
 #ifdef A_LONG
  #extension GL_ARB_gpu_shader_int64:require
  // TODO: Replace the vendor-specific (NV) 64-bit atomics extension below with a more portable one.
  #extension GL_NV_shader_atomic_int64:require
 #endif
//------------------------------------------------------------------------------------------------------------------------------
 #ifdef A_WAVE
  #extension GL_KHR_shader_subgroup_arithmetic:require
  #extension GL_KHR_shader_subgroup_ballot:require
  #extension GL_KHR_shader_subgroup_quad:require
  #extension GL_KHR_shader_subgroup_shuffle:require
 #endif
#endif
581 //==============================================================================================================================
// Portable type names mapped onto GLSL built-in types. The trailing digit is the component count.
// AP = boolean (predicate).
#define AP1 bool
#define AP2 bvec2
#define AP3 bvec3
#define AP4 bvec4
//------------------------------------------------------------------------------------------------------------------------------
// AF = 32-bit float.
#define AF1 float
#define AF2 vec2
#define AF3 vec3
#define AF4 vec4
//------------------------------------------------------------------------------------------------------------------------------
// AU = unsigned integer.
#define AU1 uint
#define AU2 uvec2
#define AU3 uvec3
#define AU4 uvec4
//------------------------------------------------------------------------------------------------------------------------------
// ASU = signed integer.
#define ASU1 int
#define ASU2 ivec2
#define ASU3 ivec3
#define ASU4 ivec4
601 //==============================================================================================================================
// Bit-pattern reinterpretation, named <result>_<source>: unsigned integer bits viewed as float.
#define AF1_AU1(x) uintBitsToFloat(AU1(x))
#define AF2_AU2(x) uintBitsToFloat(AU2(x))
#define AF3_AU3(x) uintBitsToFloat(AU3(x))
#define AF4_AU4(x) uintBitsToFloat(AU4(x))
//------------------------------------------------------------------------------------------------------------------------------
// Float bits viewed as unsigned integer.
#define AU1_AF1(x) floatBitsToUint(AF1(x))
#define AU2_AF2(x) floatBitsToUint(AF2(x))
#define AU3_AF3(x) floatBitsToUint(AF3(x))
#define AU4_AF4(x) floatBitsToUint(AF4(x))
//------------------------------------------------------------------------------------------------------------------------------
// Pack floats into a single uint using the GLSL built-in packing functions.
#define AU1_AH2_AF2 packHalf2x16
#define AU1_AW2Unorm_AF2 packUnorm2x16
#define AU1_AB4Unorm_AF4 packUnorm4x8
//------------------------------------------------------------------------------------------------------------------------------
// Matching unpack direction.
#define AF2_AH2_AU1 unpackHalf2x16
#define AF2_AW2Unorm_AU1 unpackUnorm2x16
#define AF4_AB4Unorm_AU1 unpackUnorm4x8
619 //==============================================================================================================================
// Broadcast constructors: AFn_x(a) returns an n-component float vector with every component set to 'a'.
AF1 AF1_x(AF1 a){return AF1(a);}
AF2 AF2_x(AF1 a){return AF2(a,a);}
AF3 AF3_x(AF1 a){return AF3(a,a,a);}
AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
// AFn_(a) converts the argument to AF1 before broadcasting, so the argument need not already be a float.
#define AF1_(a) AF1_x(AF1(a))
#define AF2_(a) AF2_x(AF1(a))
#define AF3_(a) AF3_x(AF1(a))
#define AF4_(a) AF4_x(AF1(a))
628 //------------------------------------------------------------------------------------------------------------------------------
// Broadcast constructors: AUn_x(a) returns an n-component unsigned vector with every component set to 'a'.
AU1 AU1_x(AU1 a){return AU1(a);}
AU2 AU2_x(AU1 a){return AU2(a,a);}
AU3 AU3_x(AU1 a){return AU3(a,a,a);}
AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
// AUn_(a) converts the argument to AU1 before broadcasting, so the argument need not already be a uint.
#define AU1_(a) AU1_x(AU1(a))
#define AU2_(a) AU2_x(AU1(a))
#define AU3_(a) AU3_x(AU1(a))
#define AU4_(a) AU4_x(AU1(a))
637 //==============================================================================================================================
// Absolute value of an unsigned-typed value treated as signed: convert to ASU*, take abs(), convert back to AU*.
AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
642 //------------------------------------------------------------------------------------------------------------------------------
// Bit-field extract: returns 'bits' bits of 'src' starting at bit offset 'off' (GLSL bitfieldExtract).
AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));}
// Bit-field insert by mask: takes bits from 'ins' where 'mask' is 1 and from 'src' where 'mask' is 0.
AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
// Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
647 //------------------------------------------------------------------------------------------------------------------------------
// V_FRACT_F32 (note DX frac() is different).
// Thin wrappers over the GLSL fract() built-in for 1 to 4 components.
AF1 AFractF1(AF1 x){return fract(x);}
AF2 AFractF2(AF2 x){return fract(x);}
AF3 AFractF3(AF3 x){return fract(x);}
AF4 AFractF4(AF4 x){return fract(x);}
653 //------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation between 'x' and 'y' by 'a', implemented with the GLSL mix() built-in (per component).
AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);}
AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
658 //------------------------------------------------------------------------------------------------------------------------------
// V_MAX3_F32.
// Per-component maximum of three floats, written as two nested max() calls.
AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
664 //------------------------------------------------------------------------------------------------------------------------------
// Per-component maximum of three values compared as signed integers: operands are converted to ASU*, the result back to AU*.
AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
669 //------------------------------------------------------------------------------------------------------------------------------
// Per-component maximum of three unsigned integers, written as two nested max() calls.
AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
674 //------------------------------------------------------------------------------------------------------------------------------
// Per-component maximum of two values compared as signed integers: operands are converted to ASU*, the result back to AU*.
AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
679 //------------------------------------------------------------------------------------------------------------------------------
// Clamp has an easier pattern match for med3 when some ordering is known.
// V_MED3_F32.
// Per-component median of three: the larger of min(x,y) and min(max(x,y),z).
AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
686 //------------------------------------------------------------------------------------------------------------------------------
// V_MIN3_F32.
// Per-component minimum of three floats, written as two nested min() calls.
AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
692 //------------------------------------------------------------------------------------------------------------------------------
// Per-component minimum of three values compared as signed integers: operands are converted to ASU*, the result back to AU*.
AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
697 //------------------------------------------------------------------------------------------------------------------------------
// Per-component minimum of three unsigned integers, written as two nested min() calls.
AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
702 //------------------------------------------------------------------------------------------------------------------------------
// Per-component minimum of two values compared as signed integers: operands are converted to ASU*, the result back to AU*.
AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
707 //------------------------------------------------------------------------------------------------------------------------------
// Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
// V_COS_F32.
// The input is multiplied by A_2PI before calling cos(), so 'x' is expressed in units of A_2PI rather than radians.
AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
714 //------------------------------------------------------------------------------------------------------------------------------
// Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
// V_SIN_F32.
// The input is multiplied by A_2PI before calling sin(), so 'x' is expressed in units of A_2PI rather than radians.
AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
721 //------------------------------------------------------------------------------------------------------------------------------
// Per-component reciprocal, computed as 1.0/x.
AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;}
AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
726 //------------------------------------------------------------------------------------------------------------------------------
// Per-component reciprocal square root, computed as 1.0/sqrt(x).
AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);}
AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
731 //------------------------------------------------------------------------------------------------------------------------------
// Saturate: clamp each component to the range 0.0 to 1.0.
AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));}
AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
736 //------------------------------------------------------------------------------------------------------------------------------
// Right shift performed on the signed interpretation: 'a' and 'b' are converted to ASU*, shifted, and converted back to AU*.
AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
741 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
742 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
743 //_____________________________________________________________/\_______________________________________________________________
744 //==============================================================================================================================
745 //                                                          GLSL BYTE
746 //==============================================================================================================================
#ifdef A_BYTE
 // 8-bit integer types, available only when A_BYTE is defined.
 // NOTE(review): no 8-bit extension is enabled specifically for A_BYTE in the visible extension preamble; confirm the
 // including shader (or A_HALF's GL_EXT_shader_explicit_arithmetic_types) provides uint8_t/u8vec*.
 // AB = unsigned byte.
 #define AB1 uint8_t
 #define AB2 u8vec2
 #define AB3 u8vec3
 #define AB4 u8vec4
//------------------------------------------------------------------------------------------------------------------------------
 // ASB = signed byte.
 #define ASB1 int8_t
 #define ASB2 i8vec2
 #define ASB3 i8vec3
 #define ASB4 i8vec4
//------------------------------------------------------------------------------------------------------------------------------
 // Broadcast constructors: ABn_x(a) returns an n-component byte vector with every component set to 'a'.
 AB1 AB1_x(AB1 a){return AB1(a);}
 AB2 AB2_x(AB1 a){return AB2(a,a);}
 AB3 AB3_x(AB1 a){return AB3(a,a,a);}
 AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
 // ABn_(a) converts the argument to AB1 before broadcasting.
 #define AB1_(a) AB1_x(AB1(a))
 #define AB2_(a) AB2_x(AB1(a))
 #define AB3_(a) AB3_x(AB1(a))
 #define AB4_(a) AB4_x(AB1(a))
#endif
767 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
768 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
769 //_____________________________________________________________/\_______________________________________________________________
770 //==============================================================================================================================
771 //                                                          GLSL HALF
772 //==============================================================================================================================
773  #ifdef A_HALF
// 16-bit types used when A_HALF is defined. The trailing digit is the component count.
// AH = 16-bit float.
#define AH1 float16_t
#define AH2 f16vec2
#define AH3 f16vec3
#define AH4 f16vec4
//------------------------------------------------------------------------------------------------------------------------------
// AW = unsigned 16-bit integer.
#define AW1 uint16_t
#define AW2 u16vec2
#define AW3 u16vec3
#define AW4 u16vec4
//------------------------------------------------------------------------------------------------------------------------------
// ASW = signed 16-bit integer.
#define ASW1 int16_t
#define ASW2 i16vec2
#define ASW3 i16vec3
#define ASW4 i16vec4
788 //==============================================================================================================================
789   #define AH2_AU1(x) unpackFloat2x16(AU1(x))
AH4_AU2_x(AU2 x)790   AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
791   #define AH4_AU2(x) AH4_AU2_x(AU2(x))
792   #define AW2_AU1(x) unpackUint2x16(AU1(x))
793   #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
794 //------------------------------------------------------------------------------------------------------------------------------
795   #define AU1_AH2(x) packFloat2x16(AH2(x))
AU2_AH4_x(AH4 x)796   AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
797   #define AU2_AH4(x) AU2_AH4_x(AH4(x))
798   #define AU1_AW2(x) packUint2x16(AW2(x))
799   #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
800 //==============================================================================================================================
  // Half -> unsigned 16-bit conversions via halfBitsToUint16(); the argument is first converted to the AH type.
  #define AW1_AH1(x) halfBitsToUint16(AH1(x))
  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
//------------------------------------------------------------------------------------------------------------------------------
  // Unsigned 16-bit -> half conversions via uint16BitsToHalf(); the argument is first converted to the AW type.
  #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
810 //==============================================================================================================================
AH1_x(AH1 a)811   AH1 AH1_x(AH1 a){return AH1(a);}
AH2_x(AH1 a)812   AH2 AH2_x(AH1 a){return AH2(a,a);}
AH3_x(AH1 a)813   AH3 AH3_x(AH1 a){return AH3(a,a,a);}
AH4_x(AH1 a)814   AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
815   #define AH1_(a) AH1_x(AH1(a))
816   #define AH2_(a) AH2_x(AH1(a))
817   #define AH3_(a) AH3_x(AH1(a))
818   #define AH4_(a) AH4_x(AH1(a))
819 //------------------------------------------------------------------------------------------------------------------------------
AW1_x(AW1 a)820   AW1 AW1_x(AW1 a){return AW1(a);}
AW2_x(AW1 a)821   AW2 AW2_x(AW1 a){return AW2(a,a);}
AW3_x(AW1 a)822   AW3 AW3_x(AW1 a){return AW3(a,a,a);}
AW4_x(AW1 a)823   AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
824   #define AW1_(a) AW1_x(AW1(a))
825   #define AW2_(a) AW2_x(AW1(a))
826   #define AW3_(a) AW3_x(AW1(a))
827   #define AW4_(a) AW4_x(AW1(a))
828 //==============================================================================================================================
AAbsSW1(AW1 a)829   AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
AAbsSW2(AW2 a)830   AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
AAbsSW3(AW3 a)831   AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
AAbsSW4(AW4 a)832   AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
833 //------------------------------------------------------------------------------------------------------------------------------
AFractH1(AH1 x)834   AH1 AFractH1(AH1 x){return fract(x);}
AFractH2(AH2 x)835   AH2 AFractH2(AH2 x){return fract(x);}
AFractH3(AH3 x)836   AH3 AFractH3(AH3 x){return fract(x);}
AFractH4(AH4 x)837   AH4 AFractH4(AH4 x){return fract(x);}
838 //------------------------------------------------------------------------------------------------------------------------------
ALerpH1(AH1 x,AH1 y,AH1 a)839   AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
ALerpH2(AH2 x,AH2 y,AH2 a)840   AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
ALerpH3(AH3 x,AH3 y,AH3 a)841   AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
ALerpH4(AH4 x,AH4 y,AH4 a)842   AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
843 //------------------------------------------------------------------------------------------------------------------------------
844   // No packed version of max3.
AMax3H1(AH1 x,AH1 y,AH1 z)845   AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
AMax3H2(AH2 x,AH2 y,AH2 z)846   AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
AMax3H3(AH3 x,AH3 y,AH3 z)847   AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
AMax3H4(AH4 x,AH4 y,AH4 z)848   AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
849 //------------------------------------------------------------------------------------------------------------------------------
AMaxSW1(AW1 a,AW1 b)850   AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));}
AMaxSW2(AW2 a,AW2 b)851   AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));}
AMaxSW3(AW3 a,AW3 b)852   AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));}
AMaxSW4(AW4 a,AW4 b)853   AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));}
854 //------------------------------------------------------------------------------------------------------------------------------
855   // No packed version of min3.
AMin3H1(AH1 x,AH1 y,AH1 z)856   AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
AMin3H2(AH2 x,AH2 y,AH2 z)857   AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
AMin3H3(AH3 x,AH3 y,AH3 z)858   AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
AMin3H4(AH4 x,AH4 y,AH4 z)859   AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
860 //------------------------------------------------------------------------------------------------------------------------------
AMinSW1(AW1 a,AW1 b)861   AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));}
AMinSW2(AW2 a,AW2 b)862   AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));}
AMinSW3(AW3 a,AW3 b)863   AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));}
AMinSW4(AW4 a,AW4 b)864   AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));}
865 //------------------------------------------------------------------------------------------------------------------------------
ARcpH1(AH1 x)866   AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;}
ARcpH2(AH2 x)867   AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
ARcpH3(AH3 x)868   AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
ARcpH4(AH4 x)869   AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
870 //------------------------------------------------------------------------------------------------------------------------------
ARsqH1(AH1 x)871   AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);}
ARsqH2(AH2 x)872   AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
ARsqH3(AH3 x)873   AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
ARsqH4(AH4 x)874   AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
875 //------------------------------------------------------------------------------------------------------------------------------
ASatH1(AH1 x)876   AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));}
ASatH2(AH2 x)877   AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
ASatH3(AH3 x)878   AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
ASatH4(AH4 x)879   AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
880 //------------------------------------------------------------------------------------------------------------------------------
AShrSW1(AW1 a,AW1 b)881   AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
AShrSW2(AW2 a,AW2 b)882   AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
AShrSW3(AW3 a,AW3 b)883   AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
AShrSW4(AW4 a,AW4 b)884   AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
885  #endif
886 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
887 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
888 //_____________________________________________________________/\_______________________________________________________________
889 //==============================================================================================================================
890 //                                                         GLSL DOUBLE
891 //==============================================================================================================================
892  #ifdef A_DUBL
  // Double-precision scalar and 2/3/4-component vector type aliases.
  #define AD1 double
  #define AD2 dvec2
  #define AD3 dvec3
  #define AD4 dvec4
897 //------------------------------------------------------------------------------------------------------------------------------
AD1_x(AD1 a)898   AD1 AD1_x(AD1 a){return AD1(a);}
AD2_x(AD1 a)899   AD2 AD2_x(AD1 a){return AD2(a,a);}
AD3_x(AD1 a)900   AD3 AD3_x(AD1 a){return AD3(a,a,a);}
AD4_x(AD1 a)901   AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
902   #define AD1_(a) AD1_x(AD1(a))
903   #define AD2_(a) AD2_x(AD1(a))
904   #define AD3_(a) AD3_x(AD1(a))
905   #define AD4_(a) AD4_x(AD1(a))
906 //==============================================================================================================================
AFractD1(AD1 x)907   AD1 AFractD1(AD1 x){return fract(x);}
AFractD2(AD2 x)908   AD2 AFractD2(AD2 x){return fract(x);}
AFractD3(AD3 x)909   AD3 AFractD3(AD3 x){return fract(x);}
AFractD4(AD4 x)910   AD4 AFractD4(AD4 x){return fract(x);}
911 //------------------------------------------------------------------------------------------------------------------------------
ALerpD1(AD1 x,AD1 y,AD1 a)912   AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);}
ALerpD2(AD2 x,AD2 y,AD2 a)913   AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);}
ALerpD3(AD3 x,AD3 y,AD3 a)914   AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);}
ALerpD4(AD4 x,AD4 y,AD4 a)915   AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);}
916 //------------------------------------------------------------------------------------------------------------------------------
ARcpD1(AD1 x)917   AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;}
ARcpD2(AD2 x)918   AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;}
ARcpD3(AD3 x)919   AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;}
ARcpD4(AD4 x)920   AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;}
921 //------------------------------------------------------------------------------------------------------------------------------
ARsqD1(AD1 x)922   AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);}
ARsqD2(AD2 x)923   AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);}
ARsqD3(AD3 x)924   AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);}
ARsqD4(AD4 x)925   AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);}
926 //------------------------------------------------------------------------------------------------------------------------------
ASatD1(AD1 x)927   AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));}
ASatD2(AD2 x)928   AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));}
ASatD3(AD3 x)929   AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));}
ASatD4(AD4 x)930   AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));}
931  #endif
932 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
933 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
934 //_____________________________________________________________/\_______________________________________________________________
935 //==============================================================================================================================
936 //                                                         GLSL LONG
937 //==============================================================================================================================
938  #ifdef A_LONG
  // Unsigned 64-bit integer scalar and 2/3/4-component vector type aliases.
  #define AL1 uint64_t
  #define AL2 u64vec2
  #define AL3 u64vec3
  #define AL4 u64vec4
//------------------------------------------------------------------------------------------------------------------------------
  // Signed 64-bit integer scalar and 2/3/4-component vector type aliases.
  #define ASL1 int64_t
  #define ASL2 i64vec2
  #define ASL3 i64vec3
  #define ASL4 i64vec4
//------------------------------------------------------------------------------------------------------------------------------
  // Conversions between one 64-bit value and a pair of 32-bit words, via packUint2x32()/unpackUint2x32().
  #define AL1_AU2(x) packUint2x32(AU2(x))
  #define AU2_AL1(x) unpackUint2x32(AL1(x))
951 //------------------------------------------------------------------------------------------------------------------------------
AL1_x(AL1 a)952   AL1 AL1_x(AL1 a){return AL1(a);}
AL2_x(AL1 a)953   AL2 AL2_x(AL1 a){return AL2(a,a);}
AL3_x(AL1 a)954   AL3 AL3_x(AL1 a){return AL3(a,a,a);}
AL4_x(AL1 a)955   AL4 AL4_x(AL1 a){return AL4(a,a,a,a);}
956   #define AL1_(a) AL1_x(AL1(a))
957   #define AL2_(a) AL2_x(AL1(a))
958   #define AL3_(a) AL3_x(AL1(a))
959   #define AL4_(a) AL4_x(AL1(a))
960 //==============================================================================================================================
AAbsSL1(AL1 a)961   AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));}
AAbsSL2(AL2 a)962   AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
AAbsSL3(AL3 a)963   AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
AAbsSL4(AL4 a)964   AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
965 //------------------------------------------------------------------------------------------------------------------------------
AMaxSL1(AL1 a,AL1 b)966   AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));}
AMaxSL2(AL2 a,AL2 b)967   AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));}
AMaxSL3(AL3 a,AL3 b)968   AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));}
AMaxSL4(AL4 a,AL4 b)969   AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));}
970 //------------------------------------------------------------------------------------------------------------------------------
AMinSL1(AL1 a,AL1 b)971   AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));}
AMinSL2(AL2 a,AL2 b)972   AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));}
AMinSL3(AL3 a,AL3 b)973   AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));}
AMinSL4(AL4 a,AL4 b)974   AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));}
975  #endif
976 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
977 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
978 //_____________________________________________________________/\_______________________________________________________________
979 //==============================================================================================================================
980 //                                                      WAVE OPERATIONS
981 //==============================================================================================================================
982  #ifdef A_WAVE
AWaveAdd(AF1 v)983   AF1 AWaveAdd(AF1 v){return subgroupAdd(v);}
AWaveAdd(AF2 v)984   AF2 AWaveAdd(AF2 v){return subgroupAdd(v);}
AWaveAdd(AF3 v)985   AF3 AWaveAdd(AF3 v){return subgroupAdd(v);}
AWaveAdd(AF4 v)986   AF4 AWaveAdd(AF4 v){return subgroupAdd(v);}
987  #endif
988 //==============================================================================================================================
989 #endif
990 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
991 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
992 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
993 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
994 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
995 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
996 //_____________________________________________________________/\_______________________________________________________________
997 //==============================================================================================================================
998 //
999 //
1000 //                                                            HLSL
1001 //
1002 //
1003 //==============================================================================================================================
1004 #if defined(A_HLSL) && defined(A_GPU)
 // Boolean scalar and vector type aliases.
 #define AP1 bool
 #define AP2 bool2
 #define AP3 bool3
 #define AP4 bool4
//------------------------------------------------------------------------------------------------------------------------------
 // Float scalar and vector type aliases.
 #define AF1 float
 #define AF2 float2
 #define AF3 float3
 #define AF4 float4
//------------------------------------------------------------------------------------------------------------------------------
 // Unsigned integer scalar and vector type aliases.
 #define AU1 uint
 #define AU2 uint2
 #define AU3 uint3
 #define AU4 uint4
//------------------------------------------------------------------------------------------------------------------------------
 // Signed integer scalar and vector type aliases.
 #define ASU1 int
 #define ASU2 int2
 #define ASU3 int3
 #define ASU4 int4
1024 //==============================================================================================================================
 // Unsigned integer -> float conversions via asfloat(); the argument is first converted to the AU type.
 #define AF1_AU1(x) asfloat(AU1(x))
 #define AF2_AU2(x) asfloat(AU2(x))
 #define AF3_AU3(x) asfloat(AU3(x))
 #define AF4_AU4(x) asfloat(AU4(x))
//------------------------------------------------------------------------------------------------------------------------------
 // Float -> unsigned integer conversions via asuint(); the argument is first converted to the AF type.
 #define AU1_AF1(x) asuint(AF1(x))
 #define AU2_AF2(x) asuint(AF2(x))
 #define AU3_AF3(x) asuint(AF3(x))
 #define AU4_AF4(x) asuint(AF4(x))
1034 //------------------------------------------------------------------------------------------------------------------------------
AU1_AH2_AF2_x(AF2 a)1035  AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);}
1036  #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a))
1037  #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
1038 //------------------------------------------------------------------------------------------------------------------------------
AF2_AH2_AU1_x(AU1 x)1039  AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));}
1040  #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
1041 //==============================================================================================================================
AF1_x(AF1 a)1042  AF1 AF1_x(AF1 a){return AF1(a);}
AF2_x(AF1 a)1043  AF2 AF2_x(AF1 a){return AF2(a,a);}
AF3_x(AF1 a)1044  AF3 AF3_x(AF1 a){return AF3(a,a,a);}
AF4_x(AF1 a)1045  AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
1046  #define AF1_(a) AF1_x(AF1(a))
1047  #define AF2_(a) AF2_x(AF1(a))
1048  #define AF3_(a) AF3_x(AF1(a))
1049  #define AF4_(a) AF4_x(AF1(a))
1050 //------------------------------------------------------------------------------------------------------------------------------
AU1_x(AU1 a)1051  AU1 AU1_x(AU1 a){return AU1(a);}
AU2_x(AU1 a)1052  AU2 AU2_x(AU1 a){return AU2(a,a);}
AU3_x(AU1 a)1053  AU3 AU3_x(AU1 a){return AU3(a,a,a);}
AU4_x(AU1 a)1054  AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
1055  #define AU1_(a) AU1_x(AU1(a))
1056  #define AU2_(a) AU2_x(AU1(a))
1057  #define AU3_(a) AU3_x(AU1(a))
1058  #define AU4_(a) AU4_x(AU1(a))
1059 //==============================================================================================================================
AAbsSU1(AU1 a)1060  AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
AAbsSU2(AU2 a)1061  AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
AAbsSU3(AU3 a)1062  AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
AAbsSU4(AU4 a)1063  AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
1064 //------------------------------------------------------------------------------------------------------------------------------
ABfe(AU1 src,AU1 off,AU1 bits)1065  AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1<<bits)-1;return (src>>off)&mask;}
ABfi(AU1 src,AU1 ins,AU1 mask)1066  AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
ABfiM(AU1 src,AU1 ins,AU1 bits)1067  AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1<<bits)-1;return (ins&mask)|(src&(~mask));}
1068 //------------------------------------------------------------------------------------------------------------------------------
AFractF1(AF1 x)1069  AF1 AFractF1(AF1 x){return x-floor(x);}
AFractF2(AF2 x)1070  AF2 AFractF2(AF2 x){return x-floor(x);}
AFractF3(AF3 x)1071  AF3 AFractF3(AF3 x){return x-floor(x);}
AFractF4(AF4 x)1072  AF4 AFractF4(AF4 x){return x-floor(x);}
1073 //------------------------------------------------------------------------------------------------------------------------------
ALerpF1(AF1 x,AF1 y,AF1 a)1074  AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);}
ALerpF2(AF2 x,AF2 y,AF2 a)1075  AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return lerp(x,y,a);}
ALerpF3(AF3 x,AF3 y,AF3 a)1076  AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);}
ALerpF4(AF4 x,AF4 y,AF4 a)1077  AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);}
1078 //------------------------------------------------------------------------------------------------------------------------------
AMax3F1(AF1 x,AF1 y,AF1 z)1079  AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
AMax3F2(AF2 x,AF2 y,AF2 z)1080  AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
AMax3F3(AF3 x,AF3 y,AF3 z)1081  AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
AMax3F4(AF4 x,AF4 y,AF4 z)1082  AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
1083 //------------------------------------------------------------------------------------------------------------------------------
AMax3SU1(AU1 x,AU1 y,AU1 z)1084  AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
AMax3SU2(AU2 x,AU2 y,AU2 z)1085  AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
AMax3SU3(AU3 x,AU3 y,AU3 z)1086  AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
AMax3SU4(AU4 x,AU4 y,AU4 z)1087  AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
1088 //------------------------------------------------------------------------------------------------------------------------------
AMax3U1(AU1 x,AU1 y,AU1 z)1089  AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
AMax3U2(AU2 x,AU2 y,AU2 z)1090  AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
AMax3U3(AU3 x,AU3 y,AU3 z)1091  AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
AMax3U4(AU4 x,AU4 y,AU4 z)1092  AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
1093 //------------------------------------------------------------------------------------------------------------------------------
AMaxSU1(AU1 a,AU1 b)1094  AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
AMaxSU2(AU2 a,AU2 b)1095  AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
AMaxSU3(AU3 a,AU3 b)1096  AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
AMaxSU4(AU4 a,AU4 b)1097  AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
1098 //------------------------------------------------------------------------------------------------------------------------------
AMed3F1(AF1 x,AF1 y,AF1 z)1099  AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
AMed3F2(AF2 x,AF2 y,AF2 z)1100  AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
AMed3F3(AF3 x,AF3 y,AF3 z)1101  AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
AMed3F4(AF4 x,AF4 y,AF4 z)1102  AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
1103 //------------------------------------------------------------------------------------------------------------------------------
AMin3F1(AF1 x,AF1 y,AF1 z)1104  AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
AMin3F2(AF2 x,AF2 y,AF2 z)1105  AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
AMin3F3(AF3 x,AF3 y,AF3 z)1106  AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
AMin3F4(AF4 x,AF4 y,AF4 z)1107  AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
1108 //------------------------------------------------------------------------------------------------------------------------------
AMin3SU1(AU1 x,AU1 y,AU1 z)1109  AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
AMin3SU2(AU2 x,AU2 y,AU2 z)1110  AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
AMin3SU3(AU3 x,AU3 y,AU3 z)1111  AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
AMin3SU4(AU4 x,AU4 y,AU4 z)1112  AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
1113 //------------------------------------------------------------------------------------------------------------------------------
AMin3U1(AU1 x,AU1 y,AU1 z)1114  AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
AMin3U2(AU2 x,AU2 y,AU2 z)1115  AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
AMin3U3(AU3 x,AU3 y,AU3 z)1116  AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
AMin3U4(AU4 x,AU4 y,AU4 z)1117  AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
1118 //------------------------------------------------------------------------------------------------------------------------------
AMinSU1(AU1 a,AU1 b)1119  AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
AMinSU2(AU2 a,AU2 b)1120  AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
AMinSU3(AU3 a,AU3 b)1121  AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
AMinSU4(AU4 a,AU4 b)1122  AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
1123 //------------------------------------------------------------------------------------------------------------------------------
ANCosF1(AF1 x)1124  AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
ANCosF2(AF2 x)1125  AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
ANCosF3(AF3 x)1126  AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
ANCosF4(AF4 x)1127  AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
1128 //------------------------------------------------------------------------------------------------------------------------------
ANSinF1(AF1 x)1129  AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
ANSinF2(AF2 x)1130  AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
ANSinF3(AF3 x)1131  AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
ANSinF4(AF4 x)1132  AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
1133 //------------------------------------------------------------------------------------------------------------------------------
ARcpF1(AF1 x)1134  AF1 ARcpF1(AF1 x){return rcp(x);}
ARcpF2(AF2 x)1135  AF2 ARcpF2(AF2 x){return rcp(x);}
ARcpF3(AF3 x)1136  AF3 ARcpF3(AF3 x){return rcp(x);}
ARcpF4(AF4 x)1137  AF4 ARcpF4(AF4 x){return rcp(x);}
1138 //------------------------------------------------------------------------------------------------------------------------------
ARsqF1(AF1 x)1139  AF1 ARsqF1(AF1 x){return rsqrt(x);}
ARsqF2(AF2 x)1140  AF2 ARsqF2(AF2 x){return rsqrt(x);}
ARsqF3(AF3 x)1141  AF3 ARsqF3(AF3 x){return rsqrt(x);}
ARsqF4(AF4 x)1142  AF4 ARsqF4(AF4 x){return rsqrt(x);}
1143 //------------------------------------------------------------------------------------------------------------------------------
ASatF1(AF1 x)1144  AF1 ASatF1(AF1 x){return saturate(x);}
ASatF2(AF2 x)1145  AF2 ASatF2(AF2 x){return saturate(x);}
ASatF3(AF3 x)1146  AF3 ASatF3(AF3 x){return saturate(x);}
ASatF4(AF4 x)1147  AF4 ASatF4(AF4 x){return saturate(x);}
1148 //------------------------------------------------------------------------------------------------------------------------------
AShrSU1(AU1 a,AU1 b)1149  AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
AShrSU2(AU2 a,AU2 b)1150  AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
AShrSU3(AU3 a,AU3 b)1151  AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
AShrSU4(AU4 a,AU4 b)1152  AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
1153 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1154 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1155 //_____________________________________________________________/\_______________________________________________________________
1156 //==============================================================================================================================
1157 //                                                          HLSL BYTE
1158 //==============================================================================================================================
1159  #ifdef A_BYTE
1160  #endif
1161 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1162 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1163 //_____________________________________________________________/\_______________________________________________________________
1164 //==============================================================================================================================
1165 //                                                          HLSL HALF
1166 //==============================================================================================================================
1167  #ifdef A_HALF
  // Half-precision float aliases, mapped to the min16float family.
  #define AH1 min16float
  #define AH2 min16float2
  #define AH3 min16float3
  #define AH4 min16float4
//------------------------------------------------------------------------------------------------------------------------------
  // Unsigned 16-bit integer aliases, mapped to the min16uint family.
  #define AW1 min16uint
  #define AW2 min16uint2
  #define AW3 min16uint3
  #define AW4 min16uint4
//------------------------------------------------------------------------------------------------------------------------------
  // Signed 16-bit integer aliases, mapped to the min16int family.
  #define ASW1 min16int
  #define ASW2 min16int2
  #define ASW3 min16int3
  #define ASW4 min16int4
1182 //==============================================================================================================================
1183   // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
1184   // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
AH2_AU1_x(AU1 x)1185   AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);}
AH4_AU2_x(AU2 x)1186   AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));}
AW2_AU1_x(AU1 x)1187   AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);}
AW4_AU2_x(AU2 x)1188   AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
1189   #define AH2_AU1(x) AH2_AU1_x(AU1(x))
1190   #define AH4_AU2(x) AH4_AU2_x(AU2(x))
1191   #define AW2_AU1(x) AW2_AU1_x(AU1(x))
1192   #define AW4_AU2(x) AW4_AU2_x(AU2(x))
1193 //------------------------------------------------------------------------------------------------------------------------------
AU1_AH2_x(AH2 x)1194   AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);}
AU2_AH4_x(AH4 x)1195   AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));}
AU1_AW2_x(AW2 x)1196   AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);}
AU2_AW4_x(AW4 x)1197   AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));}
1198   #define AU1_AH2(x) AU1_AH2_x(AH2(x))
1199   #define AU2_AH4(x) AU2_AH4_x(AH4(x))
1200   #define AU1_AW2(x) AU1_AW2_x(AW2(x))
1201   #define AU2_AW4(x) AU2_AW4_x(AW4(x))
1202 //==============================================================================================================================
1203   // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1204   #define AW1_AH1(x) AW1(asuint(AF1(x)))
1205   #define AW2_AH2(x) AW2(asuint(AF2(x)))
1206   #define AW3_AH3(x) AW3(asuint(AF3(x)))
1207   #define AW4_AH4(x) AW4(asuint(AF4(x)))
1208 //------------------------------------------------------------------------------------------------------------------------------
1209   // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1210   #define AH1_AW1(x) AH1(asfloat(AU1(x)))
1211   #define AH2_AW2(x) AH2(asfloat(AU2(x)))
1212   #define AH3_AW3(x) AH3(asfloat(AU3(x)))
1213   #define AH4_AW4(x) AH4(asfloat(AU4(x)))
1214 //==============================================================================================================================
AH1_x(AH1 a)1215   AH1 AH1_x(AH1 a){return AH1(a);}
AH2_x(AH1 a)1216   AH2 AH2_x(AH1 a){return AH2(a,a);}
AH3_x(AH1 a)1217   AH3 AH3_x(AH1 a){return AH3(a,a,a);}
AH4_x(AH1 a)1218   AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
1219   #define AH1_(a) AH1_x(AH1(a))
1220   #define AH2_(a) AH2_x(AH1(a))
1221   #define AH3_(a) AH3_x(AH1(a))
1222   #define AH4_(a) AH4_x(AH1(a))
1223 //------------------------------------------------------------------------------------------------------------------------------
AW1_x(AW1 a)1224   AW1 AW1_x(AW1 a){return AW1(a);}
AW2_x(AW1 a)1225   AW2 AW2_x(AW1 a){return AW2(a,a);}
AW3_x(AW1 a)1226   AW3 AW3_x(AW1 a){return AW3(a,a,a);}
AW4_x(AW1 a)1227   AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
1228   #define AW1_(a) AW1_x(AW1(a))
1229   #define AW2_(a) AW2_x(AW1(a))
1230   #define AW3_(a) AW3_x(AW1(a))
1231   #define AW4_(a) AW4_x(AW1(a))
1232 //==============================================================================================================================
AAbsSW1(AW1 a)1233   AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
AAbsSW2(AW2 a)1234   AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
AAbsSW3(AW3 a)1235   AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
AAbsSW4(AW4 a)1236   AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
1237 //------------------------------------------------------------------------------------------------------------------------------
1238  // V_FRACT_F16 (note DX frac() is different).
AFractH1(AH1 x)1239   AH1 AFractH1(AH1 x){return x-floor(x);}
AFractH2(AH2 x)1240   AH2 AFractH2(AH2 x){return x-floor(x);}
AFractH3(AH3 x)1241   AH3 AFractH3(AH3 x){return x-floor(x);}
AFractH4(AH4 x)1242   AH4 AFractH4(AH4 x){return x-floor(x);}
1243 //------------------------------------------------------------------------------------------------------------------------------
ALerpH1(AH1 x,AH1 y,AH1 a)1244   AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
ALerpH2(AH2 x,AH2 y,AH2 a)1245   AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
ALerpH3(AH3 x,AH3 y,AH3 a)1246   AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
ALerpH4(AH4 x,AH4 y,AH4 a)1247   AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
1248 //------------------------------------------------------------------------------------------------------------------------------
AMax3H1(AH1 x,AH1 y,AH1 z)1249   AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
AMax3H2(AH2 x,AH2 y,AH2 z)1250   AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
AMax3H3(AH3 x,AH3 y,AH3 z)1251   AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
AMax3H4(AH4 x,AH4 y,AH4 z)1252   AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
1253 //------------------------------------------------------------------------------------------------------------------------------
AMaxSW1(AW1 a,AW1 b)1254   AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));}
AMaxSW2(AW2 a,AW2 b)1255   AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));}
AMaxSW3(AW3 a,AW3 b)1256   AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));}
AMaxSW4(AW4 a,AW4 b)1257   AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));}
1258 //------------------------------------------------------------------------------------------------------------------------------
AMin3H1(AH1 x,AH1 y,AH1 z)1259   AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
AMin3H2(AH2 x,AH2 y,AH2 z)1260   AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
AMin3H3(AH3 x,AH3 y,AH3 z)1261   AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
AMin3H4(AH4 x,AH4 y,AH4 z)1262   AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
1263 //------------------------------------------------------------------------------------------------------------------------------
AMinSW1(AW1 a,AW1 b)1264   AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));}
AMinSW2(AW2 a,AW2 b)1265   AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));}
AMinSW3(AW3 a,AW3 b)1266   AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));}
AMinSW4(AW4 a,AW4 b)1267   AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));}
1268 //------------------------------------------------------------------------------------------------------------------------------
ARcpH1(AH1 x)1269   AH1 ARcpH1(AH1 x){return rcp(x);}
ARcpH2(AH2 x)1270   AH2 ARcpH2(AH2 x){return rcp(x);}
ARcpH3(AH3 x)1271   AH3 ARcpH3(AH3 x){return rcp(x);}
ARcpH4(AH4 x)1272   AH4 ARcpH4(AH4 x){return rcp(x);}
1273 //------------------------------------------------------------------------------------------------------------------------------
ARsqH1(AH1 x)1274   AH1 ARsqH1(AH1 x){return rsqrt(x);}
ARsqH2(AH2 x)1275   AH2 ARsqH2(AH2 x){return rsqrt(x);}
ARsqH3(AH3 x)1276   AH3 ARsqH3(AH3 x){return rsqrt(x);}
ARsqH4(AH4 x)1277   AH4 ARsqH4(AH4 x){return rsqrt(x);}
1278 //------------------------------------------------------------------------------------------------------------------------------
ASatH1(AH1 x)1279   AH1 ASatH1(AH1 x){return saturate(x);}
ASatH2(AH2 x)1280   AH2 ASatH2(AH2 x){return saturate(x);}
ASatH3(AH3 x)1281   AH3 ASatH3(AH3 x){return saturate(x);}
ASatH4(AH4 x)1282   AH4 ASatH4(AH4 x){return saturate(x);}
1283 //------------------------------------------------------------------------------------------------------------------------------
AShrSW1(AW1 a,AW1 b)1284   AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
AShrSW2(AW2 a,AW2 b)1285   AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
AShrSW3(AW3 a,AW3 b)1286   AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
AShrSW4(AW4 a,AW4 b)1287   AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
1288  #endif
1289 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1290 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1291 //_____________________________________________________________/\_______________________________________________________________
1292 //==============================================================================================================================
1293 //                                                         HLSL DOUBLE
1294 //==============================================================================================================================
1295  #ifdef A_DUBL
  // Double precision scalar/vector type names, mapped to the HLSL double family.
  #define AD1 double
  #define AD2 double2
  #define AD3 double3
  #define AD4 double4
1300 //------------------------------------------------------------------------------------------------------------------------------
AD1_x(AD1 a)1301   AD1 AD1_x(AD1 a){return AD1(a);}
AD2_x(AD1 a)1302   AD2 AD2_x(AD1 a){return AD2(a,a);}
AD3_x(AD1 a)1303   AD3 AD3_x(AD1 a){return AD3(a,a,a);}
AD4_x(AD1 a)1304   AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
1305   #define AD1_(a) AD1_x(AD1(a))
1306   #define AD2_(a) AD2_x(AD1(a))
1307   #define AD3_(a) AD3_x(AD1(a))
1308   #define AD4_(a) AD4_x(AD1(a))
1309 //==============================================================================================================================
AFractD1(AD1 a)1310   AD1 AFractD1(AD1 a){return a-floor(a);}
AFractD2(AD2 a)1311   AD2 AFractD2(AD2 a){return a-floor(a);}
AFractD3(AD3 a)1312   AD3 AFractD3(AD3 a){return a-floor(a);}
AFractD4(AD4 a)1313   AD4 AFractD4(AD4 a){return a-floor(a);}
1314 //------------------------------------------------------------------------------------------------------------------------------
ALerpD1(AD1 x,AD1 y,AD1 a)1315   AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);}
ALerpD2(AD2 x,AD2 y,AD2 a)1316   AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);}
ALerpD3(AD3 x,AD3 y,AD3 a)1317   AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);}
ALerpD4(AD4 x,AD4 y,AD4 a)1318   AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);}
1319 //------------------------------------------------------------------------------------------------------------------------------
ARcpD1(AD1 x)1320   AD1 ARcpD1(AD1 x){return rcp(x);}
ARcpD2(AD2 x)1321   AD2 ARcpD2(AD2 x){return rcp(x);}
ARcpD3(AD3 x)1322   AD3 ARcpD3(AD3 x){return rcp(x);}
ARcpD4(AD4 x)1323   AD4 ARcpD4(AD4 x){return rcp(x);}
1324 //------------------------------------------------------------------------------------------------------------------------------
ARsqD1(AD1 x)1325   AD1 ARsqD1(AD1 x){return rsqrt(x);}
ARsqD2(AD2 x)1326   AD2 ARsqD2(AD2 x){return rsqrt(x);}
ARsqD3(AD3 x)1327   AD3 ARsqD3(AD3 x){return rsqrt(x);}
ARsqD4(AD4 x)1328   AD4 ARsqD4(AD4 x){return rsqrt(x);}
1329 //------------------------------------------------------------------------------------------------------------------------------
ASatD1(AD1 x)1330   AD1 ASatD1(AD1 x){return saturate(x);}
ASatD2(AD2 x)1331   AD2 ASatD2(AD2 x){return saturate(x);}
ASatD3(AD3 x)1332   AD3 ASatD3(AD3 x){return saturate(x);}
ASatD4(AD4 x)1333   AD4 ASatD4(AD4 x){return saturate(x);}
1334  #endif
1335 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1336 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1337 //_____________________________________________________________/\_______________________________________________________________
1338 //==============================================================================================================================
1339 //                                                         HLSL LONG
1340 //==============================================================================================================================
1341  #ifdef A_LONG
1342  #endif
1343 //==============================================================================================================================
1344 #endif
1345 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1346 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1347 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1348 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1349 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1350 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1351 //_____________________________________________________________/\_______________________________________________________________
1352 //==============================================================================================================================
1353 //
1354 //
1355 //                                                          GPU COMMON
1356 //
1357 //
1358 //==============================================================================================================================
1359 #ifdef A_GPU
1360  // Negative and positive infinity.
1361  #define A_INFN_F AF1_AU1(0x7f800000u)
1362  #define A_INFP_F AF1_AU1(0xff800000u)
1363 //------------------------------------------------------------------------------------------------------------------------------
1364  // Copy sign from 's' to positive 'd'.
ACpySgnF1(AF1 d,AF1 s)1365  AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));}
ACpySgnF2(AF2 d,AF2 s)1366  AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));}
ACpySgnF3(AF3 d,AF3 s)1367  AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));}
ACpySgnF4(AF4 d,AF4 s)1368  AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));}
1369 //------------------------------------------------------------------------------------------------------------------------------
1370  // Single operation to return (useful to create a mask to use in lerp for branch free logic),
1371  //  m=NaN := 0
1372  //  m>=0  := 0
1373  //  m<0   := 1
1374  // Uses the following useful floating point logic,
1375  //  saturate(+a*(-INF)==-INF) := 0
1376  //  saturate( 0*(-INF)== NaN) := 0
1377  //  saturate(-a*(-INF)==+INF) := 1
ASignedF1(AF1 m)1378  AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));}
ASignedF2(AF2 m)1379  AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));}
ASignedF3(AF3 m)1380  AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));}
ASignedF4(AF4 m)1381  AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));}
1382 //==============================================================================================================================
1383  #ifdef A_HALF
1384   #define A_INFN_H AH1_AW1(0x7c00u)
1385   #define A_INFP_H AH1_AW1(0xfc00u)
1386 //------------------------------------------------------------------------------------------------------------------------------
ACpySgnH1(AH1 d,AH1 s)1387   AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));}
ACpySgnH2(AH2 d,AH2 s)1388   AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));}
ACpySgnH3(AH3 d,AH3 s)1389   AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));}
ACpySgnH4(AH4 d,AH4 s)1390   AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));}
1391 //------------------------------------------------------------------------------------------------------------------------------
ASignedH1(AH1 m)1392   AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));}
ASignedH2(AH2 m)1393   AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));}
ASignedH3(AH3 m)1394   AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));}
ASignedH4(AH4 m)1395   AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));}
1396  #endif
1397 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1398 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1399 //_____________________________________________________________/\_______________________________________________________________
1400 //==============================================================================================================================
1401 //                                                     HALF APPROXIMATIONS
1402 //------------------------------------------------------------------------------------------------------------------------------
1403 // These support only positive inputs.
1404 // Did not see value yet in specialization for range.
1405 // Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
1406 // With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
1407 // However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
1408 // And co-execution would require a compiler interleaving a lot of independent work for packed usage.
1409 //------------------------------------------------------------------------------------------------------------------------------
1410 // The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
1411 // Same with sqrt(), as this could be x*rsq() (7 ops).
1412 //------------------------------------------------------------------------------------------------------------------------------
1413 // IDEAS
1414 // =====
1415 //  - Polaris hardware has 16-bit support, but non-double rate.
//    It could still be possible to get partial double rate for some of this logic,
1417 //    by clearing out the lower half's sign when necessary and using 32-bit ops...
1418 //==============================================================================================================================
1419  #ifdef A_HALF
1420   // Minimize squared error across full positive range, 2 ops.
1421   // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output.
APrxLoSqrtH1(AH1 a)1422   AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));}
APrxLoSqrtH2(AH2 a)1423   AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));}
1424 //------------------------------------------------------------------------------------------------------------------------------
1425   // Lower precision estimation, 1 op.
1426   // Minimize squared error across {smallest normal to 16384.0}.
APrxLoRcpH1(AH1 a)1427   AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));}
APrxLoRcpH2(AH2 a)1428   AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));}
1429 //------------------------------------------------------------------------------------------------------------------------------
1430   // Medium precision estimation, one Newton Raphson iteration, 3 ops.
APrxMedRcpH1(AH1 a)1431   AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));}
APrxMedRcpH2(AH2 a)1432   AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));}
1433 //------------------------------------------------------------------------------------------------------------------------------
1434   // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
APrxLoRsqH1(AH1 a)1435   AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));}
APrxLoRsqH2(AH2 a)1436   AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));}
1437  #endif
1438 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1439 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1440 //_____________________________________________________________/\_______________________________________________________________
1441 //==============================================================================================================================
1442 //                                                    FLOAT APPROXIMATIONS
1443 //------------------------------------------------------------------------------------------------------------------------------
1444 // Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
1445 //  - Idea dates back to SGI, then to Quake 3, etc.
1446 //  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
1447 //     - sqrt(x)=rsqrt(x)*x
1448 //     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
1449 //  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
1450 //------------------------------------------------------------------------------------------------------------------------------
1451 // These below are from perhaps less complete searching for optimal.
1452 // Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
1453 // So these match up well with the half approximations.
1454 //==============================================================================================================================
APrxLoSqrtF1(AF1 a)1455  AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));}
APrxLoRcpF1(AF1 a)1456  AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));}
APrxMedRcpF1(AF1 a)1457  AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));}
APrxLoRsqF1(AF1 a)1458  AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));}
1459 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1460 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1461 //_____________________________________________________________/\_______________________________________________________________
1462 //==============================================================================================================================
1463 //                                                    PARABOLIC SIN & COS
1464 //------------------------------------------------------------------------------------------------------------------------------
1465 // Approximate answers to transcendental questions.
1466 //------------------------------------------------------------------------------------------------------------------------------
1467 // TODO
1468 // ====
1469 //  - Verify packed math ABS is correctly doing an AND.
1470 //==============================================================================================================================
1471  // Valid input range is {-1 to 1} representing {0 to 2 pi}.
 // Output range is {-1/4 to 1/4} representing {-1 to 1}.
APSinF1(AF1 x)1473  AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD.
APCosF1(AF1 x)1474  AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT
1475 //------------------------------------------------------------------------------------------------------------------------------
1476  #ifdef A_HALF
1477   // For a packed {sin,cos} pair,
1478   //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
1479   //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
APSinH2(AH2 x)1480   AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
APCosH2(AH2 x)1481   AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
1482  #endif
1483 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1484 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1485 //_____________________________________________________________/\_______________________________________________________________
1486 //==============================================================================================================================
1487 //                                                      COLOR CONVERSIONS
1488 //------------------------------------------------------------------------------------------------------------------------------
1489 // These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
1490 // So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
1491 // These are branch free implementations.
1492 // The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
1493 //------------------------------------------------------------------------------------------------------------------------------
1494 // TRANSFER FUNCTIONS
1495 // ==================
1496 // 709 ..... Rec709 used for some HDTVs
1497 // Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
1498 // Pq ...... PQ native for HDR10
1499 // Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
1500 // Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
1501 //------------------------------------------------------------------------------------------------------------------------------
1502 // FOR PQ
1503 // ======
1504 // Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
1505 // All constants are only specified to FP32 precision.
1506 // External PQ source reference,
1507 //  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
1508 //------------------------------------------------------------------------------------------------------------------------------
1509 // PACKED VERSIONS
1510 // ===============
1511 // These are the A*H2() functions.
// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
1513 // The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
1514 // Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
1515 //------------------------------------------------------------------------------------------------------------------------------
1516 // NOTES
1517 // =====
1518 // Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
1519 //==============================================================================================================================
ATo709F1(AF1 c)1520  AF1 ATo709F1(AF1 c){return max(min(c*AF1_(4.5),AF1_(0.018)),AF1_(1.099)*pow(c,AF1_(0.45))-AF1_(0.099));}
1521 //------------------------------------------------------------------------------------------------------------------------------
1522  // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
AToGammaF1(AF1 c,AF1 rcpX)1523  AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,rcpX);}
1524 //------------------------------------------------------------------------------------------------------------------------------
AToPqF1(AF1 x)1525  AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
1526   return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
1527 //------------------------------------------------------------------------------------------------------------------------------
AToSrgbF1(AF1 c)1528  AF1 AToSrgbF1(AF1 c){return max(min(c*AF1_(12.92),AF1_(0.0031308)),AF1_(1.055)*pow(c,AF1_(0.41666))-AF1_(0.055));}
1529 //------------------------------------------------------------------------------------------------------------------------------
AToTwoF1(AF1 c)1530  AF1 AToTwoF1(AF1 c){return sqrt(c);}
1531 //==============================================================================================================================
AFrom709F1(AF1 c)1532  AF1 AFrom709F1(AF1 c){return max(min(c*AF1_(1.0/4.5),AF1_(0.081)),
1533   pow((c+AF1_(0.099))*(AF1_(1.0)/(AF1_(1.099))),AF1_(1.0/0.45)));}
1534 //------------------------------------------------------------------------------------------------------------------------------
AFromGammaF1(AF1 c,AF1 x)1535  AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,x);}
1536 //------------------------------------------------------------------------------------------------------------------------------
AFromPqF1(AF1 x)1537  AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
1538   return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
1539 //------------------------------------------------------------------------------------------------------------------------------
AFromSrgbF1(AF1 c)1540  AF1 AFromSrgbF1(AF1 c){return max(min(c*AF1_(1.0/12.92),AF1_(0.04045)),
1541   pow((c+AF1_(0.055))*(AF1_(1.0)/AF1_(1.055)),AF1_(2.4)));}
1542 //------------------------------------------------------------------------------------------------------------------------------
AFromTwoF1(AF1 c)1543  AF1 AFromTwoF1(AF1 c){return c*c;}
1544 //==============================================================================================================================
1545  #ifdef A_HALF
ATo709H2(AH2 c)1546   AH2 ATo709H2(AH2 c){return max(min(c*AH2_(4.5),AH2_(0.018)),AH2_(1.099)*pow(c,AH2_(0.45))-AH2_(0.099));}
1547 //------------------------------------------------------------------------------------------------------------------------------
AToGammaH2(AH2 c,AH1 rcpX)1548   AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
1549 //------------------------------------------------------------------------------------------------------------------------------
AToSrgbH2(AH2 c)1550   AH2 AToSrgbH2(AH2 c){return max(min(c*AH2_(12.92),AH2_(0.0031308)),AH2_(1.055)*pow(c,AH2_(0.41666))-AH2_(0.055));}
1551 //------------------------------------------------------------------------------------------------------------------------------
AToTwoH2(AH2 c)1552   AH2 AToTwoH2(AH2 c){return sqrt(c);}
1553  #endif
1554 //==============================================================================================================================
1555  #ifdef A_HALF
AFrom709H2(AH2 c)1556   AH2 AFrom709H2(AH2 c){return max(min(c*AH2_(1.0/4.5),AH2_(0.081)),
1557    pow((c+AH2_(0.099))*(AH2_(1.0)/(AH2_(1.099))),AH2_(1.0/0.45)));}
1558 //------------------------------------------------------------------------------------------------------------------------------
AFromGammaH2(AH2 c,AH1 x)1559   AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
1560 //------------------------------------------------------------------------------------------------------------------------------
AFromSrgbH2(AH2 c)1561   AH2 AFromSrgbH2(AH2 c){return max(min(c*AH2_(1.0/12.92),AH2_(0.04045)),
1562    pow((c+AH2_(0.055))*(AH2_(1.0)/AH2_(1.055)),AH2_(2.4)));}
1563 //------------------------------------------------------------------------------------------------------------------------------
AFromTwoH2(AH2 c)1564   AH2 AFromTwoH2(AH2 c){return c*c;}
1565  #endif
1566 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1567 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1568 //_____________________________________________________________/\_______________________________________________________________
1569 //==============================================================================================================================
1570 //                                                          CS REMAP
1571 //==============================================================================================================================
1572  // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
1573  //  543210
1574  //  ======
1575  //  ..xxx.
1576  //  yy...y
ARmp8x8(AU1 a)1577  AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
1578 //==============================================================================================================================
1579  // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
1580  //  543210
1581  //  ======
1582  //  .xx..x
1583  //  y..yy.
1584  // Details,
1585  //  LANE TO 8x8 MAPPING
1586  //  ===================
1587  //  00 01 08 09 10 11 18 19
1588  //  02 03 0a 0b 12 13 1a 1b
1589  //  04 05 0c 0d 14 15 1c 1d
1590  //  06 07 0e 0f 16 17 1e 1f
1591  //  20 21 28 29 30 31 38 39
1592  //  22 23 2a 2b 32 33 3a 3b
1593  //  24 25 2c 2d 34 35 3c 3d
1594  //  26 27 2e 2f 36 37 3e 3f
ARmpRed8x8(AU1 a)1595  AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
1596 #endif
1597 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1598 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1599 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1600 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1601 //_____________________________________________________________/\_______________________________________________________________
1602 //==============================================================================================================================
1603 //
1604 //                                                          REFERENCE
1605 //
1606 //------------------------------------------------------------------------------------------------------------------------------
1607 // IEEE FLOAT RULES
1608 // ================
1609 //  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
1610 //  - {+/-}0 * {+/-}INF = NaN
1611 //  - -INF + (+INF) = NaN
1612 //  - {+/-}0 / {+/-}0 = NaN
1613 //  - {+/-}INF / {+/-}INF = NaN
1614 //  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
1615 //  - 0 == -0
1616 //  - 4/0 = +INF
1617 //  - 4/-0 = -INF
1618 //  - 4+INF = +INF
1619 //  - 4-INF = -INF
1620 //  - 4*(+INF) = +INF
1621 //  - 4*(-INF) = -INF
1622 //  - -4*(+INF) = -INF
1623 //  - sqrt(+INF) = +INF
1624 //------------------------------------------------------------------------------------------------------------------------------
1625 // FP16 ENCODING
1626 // =============
1627 // fedcba9876543210
1628 // ----------------
1629 // ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
1630 // .eeeee..........  5-bit exponent
1631 // .00000..........  denormals
1632 // .00001..........  -14 exponent
1633 // .11110..........   15 exponent
1634 // .111110000000000  infinity
1635 // .11111nnnnnnnnnn  NaN with n!=0
1636 // s...............  sign
1637 //------------------------------------------------------------------------------------------------------------------------------
1638 // FP16/INT16 ALIASING DENORMAL
1639 // ============================
1640 // 11-bit unsigned integers alias with half float denormal/normal values,
1641 //     1 = 2^(-24) = 1/16777216 ....................... first denormal value
1642 //     2 = 2^(-23)
1643 //   ...
1644 //  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
1645 //  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
1646 //  2047 .............................................. last normal value that still maps to integers
1647 // Scaling limits,
1648 //  2^15 = 32768 ...................................... largest power of 2 scaling
// Largest pow2 conversion mapping is at *32768,
//     1 : 2^(-9) = 1/512
//  1024 : 2
//  2047 : a little less than 4
1653 //==============================================================================================================================
1654 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1655 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1656 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1657 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1658 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1659 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1660 //_____________________________________________________________/\_______________________________________________________________
1661 //==============================================================================================================================
1662 //
1663 //
1664 //                                                     GPU/CPU PORTABILITY
1665 //
1666 //
1667 //------------------------------------------------------------------------------------------------------------------------------
1668 // This is the GPU implementation.
1669 // See the CPU implementation for docs.
1670 //==============================================================================================================================
1671 #ifdef A_GPU
1672  #define A_TRUE true
1673  #define A_FALSE false
1674  #define A_STATIC
1675 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1676 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1677 //_____________________________________________________________/\_______________________________________________________________
1678 //==============================================================================================================================
1679 //                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
1680 //==============================================================================================================================
1681  #define retAD2 AD2
1682  #define retAD3 AD3
1683  #define retAD4 AD4
1684  #define retAF2 AF2
1685  #define retAF3 AF3
1686  #define retAF4 AF4
1687  #define retAL2 AL2
1688  #define retAL3 AL3
1689  #define retAL4 AL4
1690  #define retAU2 AU2
1691  #define retAU3 AU3
1692  #define retAU4 AU4
1693 //------------------------------------------------------------------------------------------------------------------------------
1694  #define inAD2 in AD2
1695  #define inAD3 in AD3
1696  #define inAD4 in AD4
1697  #define inAF2 in AF2
1698  #define inAF3 in AF3
1699  #define inAF4 in AF4
1700  #define inAL2 in AL2
1701  #define inAL3 in AL3
1702  #define inAL4 in AL4
1703  #define inAU2 in AU2
1704  #define inAU3 in AU3
1705  #define inAU4 in AU4
1706 //------------------------------------------------------------------------------------------------------------------------------
1707  #define inoutAD2 inout AD2
1708  #define inoutAD3 inout AD3
1709  #define inoutAD4 inout AD4
1710  #define inoutAF2 inout AF2
1711  #define inoutAF3 inout AF3
1712  #define inoutAF4 inout AF4
1713  #define inoutAL2 inout AL2
1714  #define inoutAL3 inout AL3
1715  #define inoutAL4 inout AL4
1716  #define inoutAU2 inout AU2
1717  #define inoutAU3 inout AU3
1718  #define inoutAU4 inout AU4
1719 //------------------------------------------------------------------------------------------------------------------------------
1720  #define outAD2 out AD2
1721  #define outAD3 out AD3
1722  #define outAD4 out AD4
1723  #define outAF2 out AF2
1724  #define outAF3 out AF3
1725  #define outAF4 out AF4
1726  #define outAL2 out AL2
1727  #define outAL3 out AL3
1728  #define outAL4 out AL4
1729  #define outAU2 out AU2
1730  #define outAU3 out AU3
1731  #define outAU4 out AU4
1732 //------------------------------------------------------------------------------------------------------------------------------
1733  #define varAD2(x) AD2 x
1734  #define varAD3(x) AD3 x
1735  #define varAD4(x) AD4 x
1736  #define varAF2(x) AF2 x
1737  #define varAF3(x) AF3 x
1738  #define varAF4(x) AF4 x
1739  #define varAL2(x) AL2 x
1740  #define varAL3(x) AL3 x
1741  #define varAL4(x) AL4 x
1742  #define varAU2(x) AU2 x
1743  #define varAU3(x) AU3 x
1744  #define varAU4(x) AU4 x
1745 //------------------------------------------------------------------------------------------------------------------------------
1746  #define initAD2(x,y) AD2(x,y)
1747  #define initAD3(x,y,z) AD3(x,y,z)
1748  #define initAD4(x,y,z,w) AD4(x,y,z,w)
1749  #define initAF2(x,y) AF2(x,y)
1750  #define initAF3(x,y,z) AF3(x,y,z)
1751  #define initAF4(x,y,z,w) AF4(x,y,z,w)
1752  #define initAL2(x,y) AL2(x,y)
1753  #define initAL3(x,y,z) AL3(x,y,z)
1754  #define initAL4(x,y,z,w) AL4(x,y,z,w)
1755  #define initAU2(x,y) AU2(x,y)
1756  #define initAU3(x,y,z) AU3(x,y,z)
1757  #define initAU4(x,y,z,w) AU4(x,y,z,w)
1758 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1759 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1760 //_____________________________________________________________/\_______________________________________________________________
1761 //==============================================================================================================================
1762 //                                                     SCALAR RETURN OPS
1763 //==============================================================================================================================
1764  #define AAbsD1(a) abs(AD1(a))
1765  #define AAbsF1(a) abs(AF1(a))
1766 //------------------------------------------------------------------------------------------------------------------------------
1767  #define ACosD1(a) cos(AD1(a))
1768  #define ACosF1(a) cos(AF1(a))
1769 //------------------------------------------------------------------------------------------------------------------------------
1770  #define ADotD2(a,b) dot(AD2(a),AD2(b))
1771  #define ADotD3(a,b) dot(AD3(a),AD3(b))
1772  #define ADotD4(a,b) dot(AD4(a),AD4(b))
1773  #define ADotF2(a,b) dot(AF2(a),AF2(b))
1774  #define ADotF3(a,b) dot(AF3(a),AF3(b))
1775  #define ADotF4(a,b) dot(AF4(a),AF4(b))
1776 //------------------------------------------------------------------------------------------------------------------------------
1777  #define AExp2D1(a) exp2(AD1(a))
1778  #define AExp2F1(a) exp2(AF1(a))
1779 //------------------------------------------------------------------------------------------------------------------------------
1780  #define AFloorD1(a) floor(AD1(a))
1781  #define AFloorF1(a) floor(AF1(a))
1782 //------------------------------------------------------------------------------------------------------------------------------
1783  #define ALog2D1(a) log2(AD1(a))
1784  #define ALog2F1(a) log2(AF1(a))
1785 //------------------------------------------------------------------------------------------------------------------------------
1786  #define AMaxD1(a,b) min(a,b)
1787  #define AMaxF1(a,b) min(a,b)
1788  #define AMaxL1(a,b) min(a,b)
1789  #define AMaxU1(a,b) min(a,b)
1790 //------------------------------------------------------------------------------------------------------------------------------
1791  #define AMinD1(a,b) min(a,b)
1792  #define AMinF1(a,b) min(a,b)
1793  #define AMinL1(a,b) min(a,b)
1794  #define AMinU1(a,b) min(a,b)
1795 //------------------------------------------------------------------------------------------------------------------------------
1796  #define ASinD1(a) sin(AD1(a))
1797  #define ASinF1(a) sin(AF1(a))
1798 //------------------------------------------------------------------------------------------------------------------------------
1799  #define ASqrtD1(a) sqrt(AD1(a))
1800  #define ASqrtF1(a) sqrt(AF1(a))
1801 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1802 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1803 //_____________________________________________________________/\_______________________________________________________________
1804 //==============================================================================================================================
1805 //                                               SCALAR RETURN OPS - DEPENDENT
1806 //==============================================================================================================================
1807  #define APowD1(a,b) pow(AD1(a),AF1(b))
1808  #define APowF1(a,b) pow(AF1(a),AF1(b))
1809 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1810 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1811 //_____________________________________________________________/\_______________________________________________________________
1812 //==============================================================================================================================
1813 //                                                         VECTOR OPS
1814 //------------------------------------------------------------------------------------------------------------------------------
1815 // These are added as needed for production or prototyping, so not necessarily a complete set.
1816 // They follow a convention of taking in a destination and also returning the destination value to increase utility.
1817 //==============================================================================================================================
1818  #ifdef A_DUBL
opAAbsD2(outAD2 d,inAD2 a)1819   AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;}
opAAbsD3(outAD3 d,inAD3 a)1820   AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;}
opAAbsD4(outAD4 d,inAD4 a)1821   AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;}
1822 //------------------------------------------------------------------------------------------------------------------------------
opAAddD2(outAD2 d,inAD2 a,inAD2 b)1823   AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;}
opAAddD3(outAD3 d,inAD3 a,inAD3 b)1824   AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;}
opAAddD4(outAD4 d,inAD4 a,inAD4 b)1825   AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;}
1826 //------------------------------------------------------------------------------------------------------------------------------
opACpyD2(outAD2 d,inAD2 a)1827   AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;}
opACpyD3(outAD3 d,inAD3 a)1828   AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;}
opACpyD4(outAD4 d,inAD4 a)1829   AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;}
1830 //------------------------------------------------------------------------------------------------------------------------------
opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c)1831   AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;}
opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c)1832   AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;}
opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c)1833   AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;}
1834 //------------------------------------------------------------------------------------------------------------------------------
opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c)1835   AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;}
opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c)1836   AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;}
opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c)1837   AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;}
1838 //------------------------------------------------------------------------------------------------------------------------------
opAMaxD2(outAD2 d,inAD2 a,inAD2 b)1839   AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;}
opAMaxD3(outAD3 d,inAD3 a,inAD3 b)1840   AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;}
opAMaxD4(outAD4 d,inAD4 a,inAD4 b)1841   AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;}
1842 //------------------------------------------------------------------------------------------------------------------------------
opAMinD2(outAD2 d,inAD2 a,inAD2 b)1843   AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;}
opAMinD3(outAD3 d,inAD3 a,inAD3 b)1844   AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;}
opAMinD4(outAD4 d,inAD4 a,inAD4 b)1845   AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;}
1846 //------------------------------------------------------------------------------------------------------------------------------
opAMulD2(outAD2 d,inAD2 a,inAD2 b)1847   AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;}
opAMulD3(outAD3 d,inAD3 a,inAD3 b)1848   AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;}
opAMulD4(outAD4 d,inAD4 a,inAD4 b)1849   AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;}
1850 //------------------------------------------------------------------------------------------------------------------------------
opAMulOneD2(outAD2 d,inAD2 a,AD1 b)1851   AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;}
opAMulOneD3(outAD3 d,inAD3 a,AD1 b)1852   AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;}
opAMulOneD4(outAD4 d,inAD4 a,AD1 b)1853   AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;}
1854 //------------------------------------------------------------------------------------------------------------------------------
opANegD2(outAD2 d,inAD2 a)1855   AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;}
opANegD3(outAD3 d,inAD3 a)1856   AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;}
opANegD4(outAD4 d,inAD4 a)1857   AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;}
1858 //------------------------------------------------------------------------------------------------------------------------------
opARcpD2(outAD2 d,inAD2 a)1859   AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;}
opARcpD3(outAD3 d,inAD3 a)1860   AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;}
opARcpD4(outAD4 d,inAD4 a)1861   AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;}
1862  #endif
1863 //==============================================================================================================================
opAAbsF2(outAF2 d,inAF2 a)1864  AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;}
opAAbsF3(outAF3 d,inAF3 a)1865  AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;}
opAAbsF4(outAF4 d,inAF4 a)1866  AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;}
1867 //------------------------------------------------------------------------------------------------------------------------------
opAAddF2(outAF2 d,inAF2 a,inAF2 b)1868  AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;}
opAAddF3(outAF3 d,inAF3 a,inAF3 b)1869  AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;}
opAAddF4(outAF4 d,inAF4 a,inAF4 b)1870  AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;}
1871 //------------------------------------------------------------------------------------------------------------------------------
opACpyF2(outAF2 d,inAF2 a)1872  AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;}
opACpyF3(outAF3 d,inAF3 a)1873  AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;}
opACpyF4(outAF4 d,inAF4 a)1874  AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;}
1875 //------------------------------------------------------------------------------------------------------------------------------
opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c)1876  AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;}
opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c)1877  AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;}
opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c)1878  AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;}
1879 //------------------------------------------------------------------------------------------------------------------------------
opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c)1880  AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;}
opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c)1881  AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;}
opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c)1882  AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;}
1883 //------------------------------------------------------------------------------------------------------------------------------
opAMaxF2(outAF2 d,inAF2 a,inAF2 b)1884  AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;}
opAMaxF3(outAF3 d,inAF3 a,inAF3 b)1885  AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;}
opAMaxF4(outAF4 d,inAF4 a,inAF4 b)1886  AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;}
1887 //------------------------------------------------------------------------------------------------------------------------------
opAMinF2(outAF2 d,inAF2 a,inAF2 b)1888  AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;}
opAMinF3(outAF3 d,inAF3 a,inAF3 b)1889  AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;}
opAMinF4(outAF4 d,inAF4 a,inAF4 b)1890  AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;}
1891 //------------------------------------------------------------------------------------------------------------------------------
opAMulF2(outAF2 d,inAF2 a,inAF2 b)1892  AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;}
opAMulF3(outAF3 d,inAF3 a,inAF3 b)1893  AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;}
opAMulF4(outAF4 d,inAF4 a,inAF4 b)1894  AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;}
1895 //------------------------------------------------------------------------------------------------------------------------------
opAMulOneF2(outAF2 d,inAF2 a,AF1 b)1896  AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;}
opAMulOneF3(outAF3 d,inAF3 a,AF1 b)1897  AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;}
opAMulOneF4(outAF4 d,inAF4 a,AF1 b)1898  AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;}
1899 //------------------------------------------------------------------------------------------------------------------------------
opANegF2(outAF2 d,inAF2 a)1900  AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;}
opANegF3(outAF3 d,inAF3 a)1901  AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;}
opANegF4(outAF4 d,inAF4 a)1902  AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;}
1903 //------------------------------------------------------------------------------------------------------------------------------
opARcpF2(outAF2 d,inAF2 a)1904  AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;}
opARcpF3(outAF3 d,inAF3 a)1905  AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;}
opARcpF4(outAF4 d,inAF4 a)1906  AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;}
1907 #endif
1908