1 //_____________________________________________________________/\_______________________________________________________________ 2 //============================================================================================================================== 3 // 4 // [A] SHADER PORTABILITY 1.20190530 5 // 6 //============================================================================================================================== 7 // LICENSE 8 // ======= 9 // Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. 10 // Copyright (c) <2014> <Michal Drobot> 11 // ------- 12 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 13 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 14 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 15 // Software is furnished to do so, subject to the following conditions: 16 // ------- 17 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 18 // Software. 19 // ------- 20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 21 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 22 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 23 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 //------------------------------------------------------------------------------------------------------------------------------ 25 // ABOUT 26 // ===== 27 // Common central point for high-level shading language and C portability for various shader headers. 
28 //------------------------------------------------------------------------------------------------------------------------------ 29 // DEFINES 30 // ======= 31 // A_CPU ..... Include the CPU related code. 32 // A_GPU ..... Include the GPU related code. 33 // A_GLSL .... Using GLSL. 34 // A_HLSL .... Using HLSL. 35 // A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). 36 // ======= 37 // A_BYTE .... Support 8-bit integer. 38 // A_HALF .... Support 16-bit integer and floating point. 39 // A_LONG .... Support 64-bit integer. 40 // A_DUBL .... Support 64-bit floating point. 41 // ======= 42 // A_WAVE .... Support wave-wide operations. 43 //------------------------------------------------------------------------------------------------------------------------------ 44 // To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. 45 //------------------------------------------------------------------------------------------------------------------------------ 46 // SIMPLIFIED TYPE SYSTEM 47 // ====================== 48 // - All ints will be unsigned with exception of when signed is required. 49 // - Type naming simplified and shortened "A<type><#components>", 50 // - H = 16-bit float (half) 51 // - F = 32-bit float (float) 52 // - D = 64-bit float (double) 53 // - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) 54 // - B = 8-bit integer (byte) 55 // - W = 16-bit integer (word) 56 // - U = 32-bit integer (unsigned) 57 // - L = 64-bit integer (long) 58 // - Using "AS<type><#components>" for signed when required. 59 //------------------------------------------------------------------------------------------------------------------------------ 60 // TODO 61 // ==== 62 // - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). 63 // - Add subgroup ops. 
//------------------------------------------------------------------------------------------------------------------------------
// CHANGE LOG
// ==========
// 20190531 - Fixed: changed to llabs() because long is only 32-bit on Windows.
// 20190530 - Updated for new CPU/GPU portability.
// 20190528 - Fix AU1_AH2_x() on HLSL (had incorrectly swapped x and y), fixed asuint() cases.
// 20190527 - Added min3/max3 for low precision for HLSL.
// 20190526 - Updated with half approximations, added ARsq*(), and ASat*() for CPU.
// 20190519 - Added more approximations.
// 20190514 - Added long conversions.
// 20190513 - Added the real BFI, moved the other one to ABfiM().
// 20190507 - Added extra remap useful for 2D reductions.
// 20190507 - Started adding wave ops, added parabolic sin/cos.
// 20190505 - Added ASigned*() and friends, set up more auto-typecast, GLSL extensions, etc.
// 20190504 - Added min3/max3 for 32-bit integers.
// 20190503 - Added type reinterpretation for half.
// 20190416 - Added min3/max3 for half.
// 20190405 - Misc bug fixing.
// 20190404 - Cleaned up color conversion code. Switched "splat" to shorter naming "type_". Misc bug fixing.
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// COMMON
//==============================================================================================================================
// Full turn in radians (2*pi).
// Written with enough digits to be the correctly rounded value as a 64-bit float; rounds to the same 32-bit float as before.
#define A_2PI 6.28318530717958647692
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//
// CPU
//
//
//============================================================================================================================== 105 // Requires standard C types: stdint.h 106 // Requires a collection of standard math intrinsics. 107 // - Requires VS2013 when not using GCC to get exp2() and log2(). 108 // - https://blogs.msdn.microsoft.com/vcblog/2013/07/19/c99-library-support-in-visual-studio-2013/ 109 //------------------------------------------------------------------------------------------------------------------------------ 110 // This provides a minimum subset of functionality compared to the GPU parts. 111 //============================================================================================================================== 112 #ifdef A_CPU 113 // Supporting user defined overrides. 114 #ifndef A_RESTRICT 115 #define A_RESTRICT __restrict 116 #endif 117 //------------------------------------------------------------------------------------------------------------------------------ 118 #ifndef A_STATIC 119 #define A_STATIC static 120 #endif 121 //------------------------------------------------------------------------------------------------------------------------------ 122 // Same types across CPU and GPU. 123 // Predicate uses 32-bit integer (C friendly bool). 
124 typedef uint32_t AP1; 125 typedef float AF1; 126 typedef double AD1; 127 typedef uint8_t AB1; 128 typedef uint16_t AW1; 129 typedef uint32_t AU1; 130 typedef uint64_t AL1; 131 typedef int8_t ASB1; 132 typedef int16_t ASW1; 133 typedef int32_t ASU1; 134 typedef int64_t ASL1; 135 //------------------------------------------------------------------------------------------------------------------------------ 136 #define AD1_(a) ((AD1)(a)) 137 #define AF1_(a) ((AF1)(a)) 138 #define AL1_(a) ((AL1)(a)) 139 #define AU1_(a) ((AU1)(a)) 140 //------------------------------------------------------------------------------------------------------------------------------ 141 #define ASL1_(a) ((ASL1)(a)) 142 #define ASU1_(a) ((ASU1)(a)) 143 //------------------------------------------------------------------------------------------------------------------------------ AU1_AF1(AF1 a)144 A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} 145 //------------------------------------------------------------------------------------------------------------------------------ 146 #define A_TRUE 1 147 #define A_FALSE 0 148 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 149 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 150 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 151 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 152 //_____________________________________________________________/\_______________________________________________________________ 153 //============================================================================================================================== 154 // 155 // CPU/GPU PORTING 156 // 157 
//------------------------------------------------------------------------------------------------------------------------------ 158 // Hackary to get CPU and GPU to share all setup code, without duplicate code paths. 159 // Unfortunately this is the level of "ugly" that is required since the languages are very different. 160 // This uses a lower-case prefix for special vector constructs. 161 // - In C restrict pointers are used. 162 // - In the shading language, in/inout/out arguments are used. 163 // This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). 164 //============================================================================================================================== 165 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 166 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 167 //_____________________________________________________________/\_______________________________________________________________ 168 //============================================================================================================================== 169 // VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY 170 //============================================================================================================================== 171 #define retAD2 AD1 *A_RESTRICT 172 #define retAD3 AD1 *A_RESTRICT 173 #define retAD4 AD1 *A_RESTRICT 174 #define retAF2 AF1 *A_RESTRICT 175 #define retAF3 AF1 *A_RESTRICT 176 #define retAF4 AF1 *A_RESTRICT 177 #define retAL2 AL1 *A_RESTRICT 178 #define retAL3 AL1 *A_RESTRICT 179 #define retAL4 AL1 *A_RESTRICT 180 #define retAU2 AU1 *A_RESTRICT 181 #define retAU3 AU1 *A_RESTRICT 182 #define retAU4 AU1 *A_RESTRICT 183 
//------------------------------------------------------------------------------------------------------------------------------
// Vector input argument: an A_RESTRICT pointer to the scalar component type.
// The pointer is not const-qualified, so on the CPU the in/inout/out families below all expand to the same type.
#define inAD2 AD1 *A_RESTRICT
#define inAD3 AD1 *A_RESTRICT
#define inAD4 AD1 *A_RESTRICT
#define inAF2 AF1 *A_RESTRICT
#define inAF3 AF1 *A_RESTRICT
#define inAF4 AF1 *A_RESTRICT
#define inAL2 AL1 *A_RESTRICT
#define inAL3 AL1 *A_RESTRICT
#define inAL4 AL1 *A_RESTRICT
#define inAU2 AU1 *A_RESTRICT
#define inAU3 AU1 *A_RESTRICT
#define inAU4 AU1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
// Vector argument that is both read and written.
#define inoutAD2 AD1 *A_RESTRICT
#define inoutAD3 AD1 *A_RESTRICT
#define inoutAD4 AD1 *A_RESTRICT
#define inoutAF2 AF1 *A_RESTRICT
#define inoutAF3 AF1 *A_RESTRICT
#define inoutAF4 AF1 *A_RESTRICT
#define inoutAL2 AL1 *A_RESTRICT
#define inoutAL3 AL1 *A_RESTRICT
#define inoutAL4 AL1 *A_RESTRICT
#define inoutAU2 AU1 *A_RESTRICT
#define inoutAU3 AU1 *A_RESTRICT
#define inoutAU4 AU1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
// Vector output argument.
#define outAD2 AD1 *A_RESTRICT
#define outAD3 AD1 *A_RESTRICT
#define outAD4 AD1 *A_RESTRICT
#define outAF2 AF1 *A_RESTRICT
#define outAF3 AF1 *A_RESTRICT
#define outAF4 AF1 *A_RESTRICT
#define outAL2 AL1 *A_RESTRICT
#define outAL3 AL1 *A_RESTRICT
#define outAL4 AL1 *A_RESTRICT
#define outAU2 AU1 *A_RESTRICT
#define outAU3 AU1 *A_RESTRICT
#define outAU4 AU1 *A_RESTRICT
//------------------------------------------------------------------------------------------------------------------------------
// Vector variable declaration: expands to an array of the scalar type with 2, 3 or 4 elements named x.
#define varAD2(x) AD1 x[2]
#define varAD3(x) AD1 x[3]
#define varAD4(x) AD1 x[4]
#define varAF2(x) AF1 x[2]
#define varAF3(x) AF1 x[3]
#define varAF4(x) AF1 x[4]
#define varAL2(x) AL1 x[2]
#define varAL3(x) AL1 x[3]
#define varAL4(x) AL1 x[4]
#define varAU2(x) AU1 x[2]
#define varAU3(x) AU1 x[3]
#define varAU4(x) AU1 x[4]
//------------------------------------------------------------------------------------------------------------------------------
// Vector initializer: expands to a brace-enclosed list of the given components.
// The arguments are pasted in unparenthesized and are not converted to the component type.
#define initAD2(x,y) {x,y}
#define initAD3(x,y,z) {x,y,z}
#define initAD4(x,y,z,w) {x,y,z,w}
#define initAF2(x,y) {x,y}
#define initAF3(x,y,z) {x,y,z}
#define initAF4(x,y,z,w) {x,y,z,w}
#define initAL2(x,y) {x,y}
#define initAL3(x,y,z) {x,y,z}
#define initAL4(x,y,z,w) {x,y,z,w}
#define initAU2(x,y) {x,y}
#define initAU3(x,y,z) {x,y,z}
#define initAU4(x,y,z,w) {x,y,z,w}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// SCALAR RETURN OPS
//------------------------------------------------------------------------------------------------------------------------------
// TODO
// ====
//  - Replace transcendentals with manual versions.
257 //============================================================================================================================== 258 #ifdef A_GCC AAbsD1(AD1 a)259 A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} AAbsF1(AF1 a)260 A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} AAbsSU1(AU1 a)261 A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} AAbsSL1(AL1 a)262 A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_labs(ASL1_(a)));} 263 #else AAbsD1(AD1 a)264 A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} AAbsF1(AF1 a)265 A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} AAbsSU1(AU1 a)266 A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} AAbsSL1(AL1 a)267 A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs(ASL1_(a)));} 268 #endif 269 //------------------------------------------------------------------------------------------------------------------------------ 270 #ifdef A_GCC ACosD1(AD1 a)271 A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} ACosF1(AF1 a)272 A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} 273 #else ACosD1(AD1 a)274 A_STATIC AD1 ACosD1(AD1 a){return cos(a);} ACosF1(AF1 a)275 A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} 276 #endif 277 //------------------------------------------------------------------------------------------------------------------------------ ADotD2(inAD2 a,inAD2 b)278 A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} ADotD3(inAD3 a,inAD3 b)279 A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} ADotD4(inAD4 a,inAD4 b)280 A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} ADotF2(inAF2 a,inAF2 b)281 A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} ADotF3(inAF3 a,inAF3 b)282 A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} ADotF4(inAF4 a,inAF4 b)283 A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} 284 
//------------------------------------------------------------------------------------------------------------------------------ 285 #ifdef A_GCC AExp2D1(AD1 a)286 A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} AExp2F1(AF1 a)287 A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} 288 #else AExp2D1(AD1 a)289 A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} AExp2F1(AF1 a)290 A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} 291 #endif 292 //------------------------------------------------------------------------------------------------------------------------------ 293 #ifdef A_GCC AFloorD1(AD1 a)294 A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} AFloorF1(AF1 a)295 A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} 296 #else AFloorD1(AD1 a)297 A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} AFloorF1(AF1 a)298 A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} 299 #endif 300 //------------------------------------------------------------------------------------------------------------------------------ ALerpD1(AD1 a,AD1 b,AD1 c)301 A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} ALerpF1(AF1 a,AF1 b,AF1 c)302 A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} 303 //------------------------------------------------------------------------------------------------------------------------------ 304 #ifdef A_GCC ALog2D1(AD1 a)305 A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} ALog2F1(AF1 a)306 A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} 307 #else ALog2D1(AD1 a)308 A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} ALog2F1(AF1 a)309 A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} 310 #endif 311 //------------------------------------------------------------------------------------------------------------------------------ AMaxD1(AD1 a,AD1 b)312 A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} AMaxF1(AF1 a,AF1 b)313 A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} AMaxL1(AL1 a,AL1 b)314 A_STATIC AL1 AMaxL1(AL1 a,AL1 
b){return a>b?a:b;} AMaxU1(AU1 a,AU1 b)315 A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} 316 //------------------------------------------------------------------------------------------------------------------------------ 317 // These follow the convention that A integer types don't have signage, until they are operated on. AMaxSL1(AL1 a,AL1 b)318 A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} AMaxSU1(AU1 a,AU1 b)319 A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} 320 //------------------------------------------------------------------------------------------------------------------------------ AMinD1(AD1 a,AD1 b)321 A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;} AMinF1(AF1 a,AF1 b)322 A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;} AMinL1(AL1 a,AL1 b)323 A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;} AMinU1(AU1 a,AU1 b)324 A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;} 325 //------------------------------------------------------------------------------------------------------------------------------ AMinSL1(AL1 a,AL1 b)326 A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;} AMinSU1(AU1 a,AU1 b)327 A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;} 328 //------------------------------------------------------------------------------------------------------------------------------ ARcpD1(AD1 a)329 A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;} ARcpF1(AF1 a)330 A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;} 331 //------------------------------------------------------------------------------------------------------------------------------ AShrSL1(AL1 a,AL1 b)332 A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));} AShrSU1(AU1 a,AU1 b)333 A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} 334 //------------------------------------------------------------------------------------------------------------------------------ 335 #ifdef A_GCC ASinD1(AD1 a)336 A_STATIC 
AD1 ASinD1(AD1 a){return __builtin_sin(a);} ASinF1(AF1 a)337 A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} 338 #else ASinD1(AD1 a)339 A_STATIC AD1 ASinD1(AD1 a){return sin(a);} ASinF1(AF1 a)340 A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} 341 #endif 342 //------------------------------------------------------------------------------------------------------------------------------ 343 #ifdef A_GCC ASqrtD1(AD1 a)344 A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} ASqrtF1(AF1 a)345 A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} 346 #else ASqrtD1(AD1 a)347 A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} ASqrtF1(AF1 a)348 A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} 349 #endif 350 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 351 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 352 //_____________________________________________________________/\_______________________________________________________________ 353 //============================================================================================================================== 354 // SCALAR RETURN OPS - DEPENDENT 355 //============================================================================================================================== AFractD1(AD1 a)356 A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} AFractF1(AF1 a)357 A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} 358 //------------------------------------------------------------------------------------------------------------------------------ APowD1(AD1 a,AD1 b)359 A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} APowF1(AF1 a,AF1 b)360 A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} 361 //------------------------------------------------------------------------------------------------------------------------------ ARsqD1(AD1 
a)362 A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} ARsqF1(AF1 a)363 A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} 364 //------------------------------------------------------------------------------------------------------------------------------ ASatD1(AD1 a)365 A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} ASatF1(AF1 a)366 A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} 367 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 368 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 369 //_____________________________________________________________/\_______________________________________________________________ 370 //============================================================================================================================== 371 // VECTOR OPS 372 //------------------------------------------------------------------------------------------------------------------------------ 373 // These are added as needed for production or prototyping, so not necessarily a complete set. 374 // They follow a convention of taking in a destination and also returning the destination value to increase utility. 
375 //============================================================================================================================== opAAbsD2(outAD2 d,inAD2 a)376 A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} opAAbsD3(outAD3 d,inAD3 a)377 A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} opAAbsD4(outAD4 d,inAD4 a)378 A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} 379 //------------------------------------------------------------------------------------------------------------------------------ opAAbsF2(outAF2 d,inAF2 a)380 A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} opAAbsF3(outAF3 d,inAF3 a)381 A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} opAAbsF4(outAF4 d,inAF4 a)382 A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} 383 //============================================================================================================================== opAAddD2(outAD2 d,inAD2 a,inAD2 b)384 A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} opAAddD3(outAD3 d,inAD3 a,inAD3 b)385 A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} opAAddD4(outAD4 d,inAD4 a,inAD4 b)386 A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} 387 //------------------------------------------------------------------------------------------------------------------------------ opAAddF2(outAF2 d,inAF2 a,inAF2 b)388 A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} opAAddF3(outAF3 d,inAF3 a,inAF3 b)389 A_STATIC retAF3 opAAddF3(outAF3 
d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} opAAddF4(outAF4 d,inAF4 a,inAF4 b)390 A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} 391 //============================================================================================================================== opACpyD2(outAD2 d,inAD2 a)392 A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} opACpyD3(outAD3 d,inAD3 a)393 A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} opACpyD4(outAD4 d,inAD4 a)394 A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} 395 //------------------------------------------------------------------------------------------------------------------------------ opACpyF2(outAF2 d,inAF2 a)396 A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} opACpyF3(outAF3 d,inAF3 a)397 A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} opACpyF4(outAF4 d,inAF4 a)398 A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} 399 //============================================================================================================================== opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c)400 A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c)401 A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c)402 A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} 403 
//------------------------------------------------------------------------------------------------------------------------------ opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c)404 A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c)405 A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c)406 A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} 407 //============================================================================================================================== opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c)408 A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c)409 A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c)410 A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} 411 //------------------------------------------------------------------------------------------------------------------------------ opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c)412 A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c)413 A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} opALerpOneF4(outAF4 
d,inAF4 a,inAF4 b,AF1 c)414 A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} 415 //============================================================================================================================== opAMaxD2(outAD2 d,inAD2 a,inAD2 b)416 A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} opAMaxD3(outAD3 d,inAD3 a,inAD3 b)417 A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} opAMaxD4(outAD4 d,inAD4 a,inAD4 b)418 A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} 419 //------------------------------------------------------------------------------------------------------------------------------ opAMaxF2(outAF2 d,inAF2 a,inAF2 b)420 A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} opAMaxF3(outAF3 d,inAF3 a,inAF3 b)421 A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} opAMaxF4(outAF4 d,inAF4 a,inAF4 b)422 A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} 423 //============================================================================================================================== opAMinD2(outAD2 d,inAD2 a,inAD2 b)424 A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} opAMinD3(outAD3 d,inAD3 a,inAD3 b)425 A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} opAMinD4(outAD4 d,inAD4 a,inAD4 b)426 A_STATIC retAD4 
opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} 427 //------------------------------------------------------------------------------------------------------------------------------ opAMinF2(outAF2 d,inAF2 a,inAF2 b)428 A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} opAMinF3(outAF3 d,inAF3 a,inAF3 b)429 A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} opAMinF4(outAF4 d,inAF4 a,inAF4 b)430 A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} 431 //============================================================================================================================== opAMulD2(outAD2 d,inAD2 a,inAD2 b)432 A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} opAMulD3(outAD3 d,inAD3 a,inAD3 b)433 A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} opAMulD4(outAD4 d,inAD4 a,inAD4 b)434 A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} 435 //------------------------------------------------------------------------------------------------------------------------------ opAMulF2(outAF2 d,inAF2 a,inAF2 b)436 A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} opAMulF3(outAF3 d,inAF3 a,inAF3 b)437 A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} opAMulF4(outAF4 d,inAF4 a,inAF4 b)438 A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} 439 
//============================================================================================================================== opAMulOneD2(outAD2 d,inAD2 a,AD1 b)440 A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} opAMulOneD3(outAD3 d,inAD3 a,AD1 b)441 A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} opAMulOneD4(outAD4 d,inAD4 a,AD1 b)442 A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} 443 //------------------------------------------------------------------------------------------------------------------------------ opAMulOneF2(outAF2 d,inAF2 a,AF1 b)444 A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} opAMulOneF3(outAF3 d,inAF3 a,AF1 b)445 A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} opAMulOneF4(outAF4 d,inAF4 a,AF1 b)446 A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} 447 //============================================================================================================================== opANegD2(outAD2 d,inAD2 a)448 A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} opANegD3(outAD3 d,inAD3 a)449 A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} opANegD4(outAD4 d,inAD4 a)450 A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} 451 //------------------------------------------------------------------------------------------------------------------------------ opANegF2(outAF2 d,inAF2 a)452 A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;} opANegF3(outAF3 d,inAF3 a)453 A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} opANegF4(outAF4 d,inAF4 a)454 A_STATIC retAF4 opANegF4(outAF4 d,inAF4 
a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} 455 //============================================================================================================================== opARcpD2(outAD2 d,inAD2 a)456 A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} opARcpD3(outAD3 d,inAD3 a)457 A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} opARcpD4(outAD4 d,inAD4 a)458 A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} 459 //------------------------------------------------------------------------------------------------------------------------------ opARcpF2(outAF2 d,inAF2 a)460 A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} opARcpF3(outAF3 d,inAF3 a)461 A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} opARcpF4(outAF4 d,inAF4 a)462 A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} 463 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 464 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 465 //_____________________________________________________________/\_______________________________________________________________ 466 //============================================================================================================================== 467 // HALF FLOAT PACKING 468 //============================================================================================================================== 469 // Convert float to half (in lower 16-bits of output). 
470 // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf 471 // Supports denormals. 472 // Conversion rules are to make computations possibly "safer" on the GPU, 473 // -INF & -NaN -> -65504 474 // +INF & +NaN -> +65504 AU1_AH1_AF1(AF1 f)475 A_STATIC AU1 AU1_AH1_AF1(AF1 f){ 476 static AW1 base[512]={ 477 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 478 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 479 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 480 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 481 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 482 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 483 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, 484 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, 485 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, 486 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 487 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 488 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 489 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 490 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 491 
0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 492 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 493 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 494 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 495 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 496 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 497 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 498 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 499 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, 500 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, 501 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, 502 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 503 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 504 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 505 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 506 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 507 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 508 
0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; 509 static AB1 shift[512]={ 510 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 511 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 512 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 513 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 514 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 515 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 516 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, 517 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, 518 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, 519 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 520 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 521 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 522 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 523 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 524 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 525 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 526 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 527 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 528 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 529 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 530 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 531 
0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 532 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, 533 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, 534 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, 535 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 536 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 537 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 538 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 539 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 540 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 541 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; 542 union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} 543 //------------------------------------------------------------------------------------------------------------------------------ 544 // Used to output packed constant. 
AU1_AH2_AF2(inAF2 a)545 A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} 546 #endif 547 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 548 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 549 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 550 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 551 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 552 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 553 //_____________________________________________________________/\_______________________________________________________________ 554 //============================================================================================================================== 555 // 556 // 557 // GLSL 558 // 559 // 560 //============================================================================================================================== 561 #if defined(A_GLSL) && defined(A_GPU) 562 #ifndef A_SKIP_EXT 563 #ifdef A_HALF 564 #extension GL_EXT_shader_16bit_storage:require 565 #extension GL_EXT_shader_explicit_arithmetic_types:require 566 #endif 567 //------------------------------------------------------------------------------------------------------------------------------ 568 #ifdef A_LONG 569 #extension GL_ARB_gpu_shader_int64:require 570 // TODO: Fixme to more portable extension!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
571 #extension GL_NV_shader_atomic_int64:require 572 #endif 573 //------------------------------------------------------------------------------------------------------------------------------ 574 #ifdef A_WAVE 575 #extension GL_KHR_shader_subgroup_arithmetic:require 576 #extension GL_KHR_shader_subgroup_ballot:require 577 #extension GL_KHR_shader_subgroup_quad:require 578 #extension GL_KHR_shader_subgroup_shuffle:require 579 #endif 580 #endif 581 //============================================================================================================================== 582 #define AP1 bool 583 #define AP2 bvec2 584 #define AP3 bvec3 585 #define AP4 bvec4 586 //------------------------------------------------------------------------------------------------------------------------------ 587 #define AF1 float 588 #define AF2 vec2 589 #define AF3 vec3 590 #define AF4 vec4 591 //------------------------------------------------------------------------------------------------------------------------------ 592 #define AU1 uint 593 #define AU2 uvec2 594 #define AU3 uvec3 595 #define AU4 uvec4 596 //------------------------------------------------------------------------------------------------------------------------------ 597 #define ASU1 int 598 #define ASU2 ivec2 599 #define ASU3 ivec3 600 #define ASU4 ivec4 601 //============================================================================================================================== 602 #define AF1_AU1(x) uintBitsToFloat(AU1(x)) 603 #define AF2_AU2(x) uintBitsToFloat(AU2(x)) 604 #define AF3_AU3(x) uintBitsToFloat(AU3(x)) 605 #define AF4_AU4(x) uintBitsToFloat(AU4(x)) 606 //------------------------------------------------------------------------------------------------------------------------------ 607 #define AU1_AF1(x) floatBitsToUint(AF1(x)) 608 #define AU2_AF2(x) floatBitsToUint(AF2(x)) 609 #define AU3_AF3(x) floatBitsToUint(AF3(x)) 610 #define AU4_AF4(x) floatBitsToUint(AF4(x)) 611 
//------------------------------------------------------------------------------------------------------------------------------
// Packing aliases for the GLSL built-ins.
#define AU1_AH2_AF2 packHalf2x16
#define AU1_AW2Unorm_AF2 packUnorm2x16
#define AU1_AB4Unorm_AF4 packUnorm4x8
//------------------------------------------------------------------------------------------------------------------------------
// Unpacking aliases for the GLSL built-ins.
#define AF2_AH2_AU1 unpackHalf2x16
#define AF2_AW2Unorm_AU1 unpackUnorm2x16
#define AF4_AB4Unorm_AU1 unpackUnorm4x8
//==============================================================================================================================
// Replicate one float into every component; the A??_() macros convert the argument to AF1 first.
AF1 AF1_x(AF1 a) { return AF1(a); }
AF2 AF2_x(AF1 a) { return AF2(a, a); }
AF3 AF3_x(AF1 a) { return AF3(a, a, a); }
AF4 AF4_x(AF1 a) { return AF4(a, a, a, a); }
#define AF1_(a) AF1_x(AF1(a))
#define AF2_(a) AF2_x(AF1(a))
#define AF3_(a) AF3_x(AF1(a))
#define AF4_(a) AF4_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
// Replicate one unsigned integer into every component; the A??_() macros convert the argument to AU1 first.
AU1 AU1_x(AU1 a) { return AU1(a); }
AU2 AU2_x(AU1 a) { return AU2(a, a); }
AU3 AU3_x(AU1 a) { return AU3(a, a, a); }
AU4 AU4_x(AU1 a) { return AU4(a, a, a, a); }
#define AU1_(a) AU1_x(AU1(a))
#define AU2_(a) AU2_x(AU1(a))
#define AU3_(a) AU3_x(AU1(a))
#define AU4_(a) AU4_x(AU1(a))
//==============================================================================================================================
// Absolute value with the operand treated as signed; the result is returned in the unsigned type.
AU1 AAbsSU1(AU1 a) { return AU1(abs(ASU1(a))); }
AU2 AAbsSU2(AU2 a) { return AU2(abs(ASU2(a))); }
AU3 AAbsSU3(AU3 a) { return AU3(abs(ASU3(a))); }
AU4 AAbsSU4(AU4 a) { return AU4(abs(ASU4(a))); }
//------------------------------------------------------------------------------------------------------------------------------
// Bit field extract: 'bits' bits of 'src' starting at bit 'off'.
AU1 ABfe(AU1 src, AU1 off, AU1 bits) { return bitfieldExtract(src, ASU1(off), ASU1(bits)); }
// Bit field insert: takes 'ins' where 'mask' is set and 'src' everywhere else.
AU1 ABfi(AU1 src, AU1 ins, AU1 mask) { return (ins & mask) | (src & (~mask)); }
// Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
AU1 ABfiM(AU1 src, AU1 ins, AU1 bits) { return bitfieldInsert(src, ins, 0, ASU1(bits)); }
//------------------------------------------------------------------------------------------------------------------------------
// V_FRACT_F32 (note DX frac() is different).
AF1 AFractF1(AF1 x) { return fract(x); }
AF2 AFractF2(AF2 x) { return fract(x); }
AF3 AFractF3(AF3 x) { return fract(x); }
AF4 AFractF4(AF4 x) { return fract(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation from x to y by a, via mix().
AF1 ALerpF1(AF1 x, AF1 y, AF1 a) { return mix(x, y, a); }
AF2 ALerpF2(AF2 x, AF2 y, AF2 a) { return mix(x, y, a); }
AF3 ALerpF3(AF3 x, AF3 y, AF3 a) { return mix(x, y, a); }
AF4 ALerpF4(AF4 x, AF4 y, AF4 a) { return mix(x, y, a); }
//------------------------------------------------------------------------------------------------------------------------------
// V_MAX3_F32.
AF1 AMax3F1(AF1 x, AF1 y, AF1 z) { return max(x, max(y, z)); }
AF2 AMax3F2(AF2 x, AF2 y, AF2 z) { return max(x, max(y, z)); }
AF3 AMax3F3(AF3 x, AF3 y, AF3 z) { return max(x, max(y, z)); }
AF4 AMax3F4(AF4 x, AF4 y, AF4 z) { return max(x, max(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Three-way maximum with the operands compared as signed integers.
AU1 AMax3SU1(AU1 x, AU1 y, AU1 z) { return AU1(max(ASU1(x), max(ASU1(y), ASU1(z)))); }
AU2 AMax3SU2(AU2 x, AU2 y, AU2 z) { return AU2(max(ASU2(x), max(ASU2(y), ASU2(z)))); }
AU3 AMax3SU3(AU3 x, AU3 y, AU3 z) { return AU3(max(ASU3(x), max(ASU3(y), ASU3(z)))); }
AU4 AMax3SU4(AU4 x, AU4 y, AU4 z) { return AU4(max(ASU4(x), max(ASU4(y), ASU4(z)))); }
//------------------------------------------------------------------------------------------------------------------------------
// Three-way maximum, unsigned.
AU1 AMax3U1(AU1 x, AU1 y, AU1 z) { return max(x, max(y, z)); }
AU2 AMax3U2(AU2 x, AU2 y, AU2 z) { return max(x, max(y, z)); }
AU3 AMax3U3(AU3 x, AU3 y, AU3 z) { return max(x, max(y, z)); }
AU4 AMax3U4(AU4 x, AU4 y, AU4 z) { return max(x, max(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Two-way maximum with the operands compared as signed integers.
AU1 AMaxSU1(AU1 a, AU1 b) { return AU1(max(ASU1(a), ASU1(b))); }
AU2 AMaxSU2(AU2 a, AU2 b) { return AU2(max(ASU2(a), ASU2(b))); }
AU3 AMaxSU3(AU3 a, AU3 b) { return AU3(max(ASU3(a), ASU3(b))); }
AU4 AMaxSU4(AU4 a, AU4 b) { return AU4(max(ASU4(a), ASU4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// Clamp has an easier pattern match for med3 when some ordering is known.
// V_MED3_F32.
AF1 AMed3F1(AF1 x, AF1 y, AF1 z) { return max(min(x, y), min(max(x, y), z)); }
AF2 AMed3F2(AF2 x, AF2 y, AF2 z) { return max(min(x, y), min(max(x, y), z)); }
AF3 AMed3F3(AF3 x, AF3 y, AF3 z) { return max(min(x, y), min(max(x, y), z)); }
AF4 AMed3F4(AF4 x, AF4 y, AF4 z) { return max(min(x, y), min(max(x, y), z)); }
//------------------------------------------------------------------------------------------------------------------------------
// V_MIN3_F32.
AF1 AMin3F1(AF1 x, AF1 y, AF1 z) { return min(x, min(y, z)); }
AF2 AMin3F2(AF2 x, AF2 y, AF2 z) { return min(x, min(y, z)); }
AF3 AMin3F3(AF3 x, AF3 y, AF3 z) { return min(x, min(y, z)); }
AF4 AMin3F4(AF4 x, AF4 y, AF4 z) { return min(x, min(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Three-way minimum with the operands compared as signed integers.
AU1 AMin3SU1(AU1 x, AU1 y, AU1 z) { return AU1(min(ASU1(x), min(ASU1(y), ASU1(z)))); }
AU2 AMin3SU2(AU2 x, AU2 y, AU2 z) { return AU2(min(ASU2(x), min(ASU2(y), ASU2(z)))); }
AU3 AMin3SU3(AU3 x, AU3 y, AU3 z) { return AU3(min(ASU3(x), min(ASU3(y), ASU3(z)))); }
AU4 AMin3SU4(AU4 x, AU4 y, AU4 z) { return AU4(min(ASU4(x), min(ASU4(y), ASU4(z)))); }
//------------------------------------------------------------------------------------------------------------------------------
// Three-way minimum, unsigned.
AU1 AMin3U1(AU1 x, AU1 y, AU1 z) { return min(x, min(y, z)); }
AU2 AMin3U2(AU2 x, AU2 y, AU2 z) { return min(x, min(y, z)); }
AU3 AMin3U3(AU3 x, AU3 y, AU3 z) { return min(x, min(y, z)); }
AU4 AMin3U4(AU4 x, AU4 y, AU4 z) { return min(x, min(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Two-way minimum with the operands compared as signed integers.
AU1 AMinSU1(AU1 a, AU1 b) { return AU1(min(ASU1(a), ASU1(b))); }
AU2 AMinSU2(AU2 a, AU2 b) { return AU2(min(ASU2(a), ASU2(b))); }
AU3 AMinSU3(AU3 a, AU3 b) { return AU3(min(ASU3(a), ASU3(b))); }
AU4 AMinSU4(AU4 a, AU4 b) { return AU4(min(ASU4(a), ASU4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
// V_COS_F32.
AF1 ANCosF1(AF1 x) { return cos(x * AF1_(A_2PI)); }
AF2 ANCosF2(AF2 x) { return cos(x * AF2_(A_2PI)); }
AF3 ANCosF3(AF3 x) { return cos(x * AF3_(A_2PI)); }
AF4 ANCosF4(AF4 x) { return cos(x * AF4_(A_2PI)); }
//------------------------------------------------------------------------------------------------------------------------------
// Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
// V_SIN_F32.
AF1 ANSinF1(AF1 x) { return sin(x * AF1_(A_2PI)); }
AF2 ANSinF2(AF2 x) { return sin(x * AF2_(A_2PI)); }
AF3 ANSinF3(AF3 x) { return sin(x * AF3_(A_2PI)); }
AF4 ANSinF4(AF4 x) { return sin(x * AF4_(A_2PI)); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal, written as 1.0 / x.
AF1 ARcpF1(AF1 x) { return AF1_(1.0) / x; }
AF2 ARcpF2(AF2 x) { return AF2_(1.0) / x; }
AF3 ARcpF3(AF3 x) { return AF3_(1.0) / x; }
AF4 ARcpF4(AF4 x) { return AF4_(1.0) / x; }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal square root, written as 1.0 / sqrt(x).
AF1 ARsqF1(AF1 x) { return AF1_(1.0) / sqrt(x); }
AF2 ARsqF2(AF2 x) { return AF2_(1.0) / sqrt(x); }
AF3 ARsqF3(AF3 x) { return AF3_(1.0) / sqrt(x); }
AF4 ARsqF4(AF4 x) { return AF4_(1.0) / sqrt(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Saturate: clamp to the range 0.0 .. 1.0.
AF1 ASatF1(AF1 x) { return clamp(x, AF1_(0.0), AF1_(1.0)); }
AF2 ASatF2(AF2 x) { return clamp(x, AF2_(0.0), AF2_(1.0)); }
AF3 ASatF3(AF3 x) { return clamp(x, AF3_(0.0), AF3_(1.0)); }
AF4 ASatF4(AF4 x) { return clamp(x, AF4_(0.0), AF4_(1.0)); }
//------------------------------------------------------------------------------------------------------------------------------
// Shift right performed on the signed types; the result is returned in the unsigned type.
AU1 AShrSU1(AU1 a, AU1 b) { return AU1(ASU1(a) >> ASU1(b)); }
AU2 AShrSU2(AU2 a, AU2 b) { return AU2(ASU2(a) >> ASU2(b)); }
AU3 AShrSU3(AU3 a, AU3 b) { return AU3(ASU3(a) >> ASU3(b)); }
AU4 AShrSU4(AU4 a, AU4 b) { return AU4(ASU4(a) >> ASU4(b)); }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// GLSL BYTE
//==============================================================================================================================
#ifdef A_BYTE
// 8-bit unsigned integer types.
#define AB1 uint8_t
#define AB2 u8vec2
#define AB3 u8vec3
#define AB4 u8vec4
//------------------------------------------------------------------------------------------------------------------------------
// 8-bit signed integer types.
#define ASB1 int8_t
#define ASB2 i8vec2
#define ASB3 i8vec3
#define ASB4 i8vec4
//------------------------------------------------------------------------------------------------------------------------------
// Replicate one byte into every component; the A??_() macros convert the argument to AB1 first.
AB1 AB1_x(AB1 a) { return AB1(a); }
AB2 AB2_x(AB1 a) { return AB2(a, a); }
AB3 AB3_x(AB1 a) { return AB3(a, a, a); }
AB4 AB4_x(AB1 a) { return AB4(a, a, a, a); }
#define AB1_(a) AB1_x(AB1(a))
#define AB2_(a) AB2_x(AB1(a))
#define AB3_(a) AB3_x(AB1(a))
#define AB4_(a) AB4_x(AB1(a))
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// GLSL HALF
//==============================================================================================================================
#ifdef A_HALF
// 16-bit float types.
#define AH1 float16_t
#define AH2 f16vec2
#define AH3 f16vec3
#define AH4 f16vec4
//------------------------------------------------------------------------------------------------------------------------------
// 16-bit unsigned integer types.
#define AW1 uint16_t
#define AW2 u16vec2
#define AW3 u16vec3
#define AW4 u16vec4
//------------------------------------------------------------------------------------------------------------------------------
// 16-bit signed integer types.
#define ASW1 int16_t
#define ASW2 i16vec2
#define ASW3 i16vec3
#define ASW4 i16vec4
//==============================================================================================================================
// Unpack 32-bit words into 16-bit components.
#define AH2_AU1(x) unpackFloat2x16(AU1(x))
AH4 AH4_AU2_x(AU2 x) { return AH4(unpackFloat2x16(x.x), unpackFloat2x16(x.y)); }
#define AH4_AU2(x) AH4_AU2_x(AU2(x))
#define AW2_AU1(x) unpackUint2x16(AU1(x))
#define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
//------------------------------------------------------------------------------------------------------------------------------
// Pack 16-bit components into 32-bit words.
#define AU1_AH2(x) packFloat2x16(AH2(x))
AU2 AU2_AH4_x(AH4 x) { return AU2(packFloat2x16(x.xy), packFloat2x16(x.zw)); }
#define AU2_AH4(x) AU2_AH4_x(AH4(x))
#define AU1_AW2(x) packUint2x16(AW2(x))
#define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
//==============================================================================================================================
// Bit-cast half -> 16-bit unsigned.
#define AW1_AH1(x) halfBitsToUint16(AH1(x))
#define AW2_AH2(x) halfBitsToUint16(AH2(x))
#define AW3_AH3(x) halfBitsToUint16(AH3(x))
#define AW4_AH4(x) halfBitsToUint16(AH4(x))
//------------------------------------------------------------------------------------------------------------------------------
// Bit-cast 16-bit unsigned -> half.
#define AH1_AW1(x) uint16BitsToHalf(AW1(x))
#define AH2_AW2(x) uint16BitsToHalf(AW2(x))
#define AH3_AW3(x) uint16BitsToHalf(AW3(x))
#define AH4_AW4(x) uint16BitsToHalf(AW4(x))
//==============================================================================================================================
// Replicate one half into every component; the A??_() macros convert the argument to AH1 first.
AH1 AH1_x(AH1 a) { return AH1(a); }
AH2 AH2_x(AH1 a) { return AH2(a, a); }
AH3 AH3_x(AH1 a) { return AH3(a, a, a); }
AH4 AH4_x(AH1 a) { return AH4(a, a, a, a); }
#define AH1_(a) AH1_x(AH1(a))
#define AH2_(a) AH2_x(AH1(a))
#define AH3_(a) AH3_x(AH1(a))
#define AH4_(a) AH4_x(AH1(a))
//------------------------------------------------------------------------------------------------------------------------------
// Replicate one 16-bit word into every component; the A??_() macros convert the argument to AW1 first.
AW1 AW1_x(AW1 a) { return AW1(a); }
AW2 AW2_x(AW1 a) { return AW2(a, a); }
AW3 AW3_x(AW1 a) { return AW3(a, a, a); }
AW4 AW4_x(AW1 a) { return AW4(a, a, a, a); }
#define AW1_(a) AW1_x(AW1(a))
#define AW2_(a) AW2_x(AW1(a))
#define AW3_(a) AW3_x(AW1(a))
#define AW4_(a) AW4_x(AW1(a))
//==============================================================================================================================
// Absolute value with the operand treated as signed 16-bit; the result is returned in the unsigned type.
AW1 AAbsSW1(AW1 a) { return AW1(abs(ASW1(a))); }
AW2 AAbsSW2(AW2 a) { return AW2(abs(ASW2(a))); }
AW3 AAbsSW3(AW3 a) { return AW3(abs(ASW3(a))); }
AW4 AAbsSW4(AW4 a) { return AW4(abs(ASW4(a))); }
//------------------------------------------------------------------------------------------------------------------------------
AH1 AFractH1(AH1 x) { return fract(x); }
AH2 AFractH2(AH2 x) { return fract(x); }
AH3 AFractH3(AH3 x) { return fract(x); }
AH4 AFractH4(AH4 x) { return fract(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Linear interpolation from x to y by a, via mix().
AH1 ALerpH1(AH1 x, AH1 y, AH1 a) { return mix(x, y, a); }
AH2 ALerpH2(AH2 x, AH2 y, AH2 a) { return mix(x, y, a); }
AH3 ALerpH3(AH3 x, AH3 y, AH3 a) { return mix(x, y, a); }
AH4 ALerpH4(AH4 x, AH4 y, AH4 a) { return mix(x, y, a); }
//------------------------------------------------------------------------------------------------------------------------------
// No packed version of max3.
AH1 AMax3H1(AH1 x, AH1 y, AH1 z) { return max(x, max(y, z)); }
AH2 AMax3H2(AH2 x, AH2 y, AH2 z) { return max(x, max(y, z)); }
AH3 AMax3H3(AH3 x, AH3 y, AH3 z) { return max(x, max(y, z)); }
AH4 AMax3H4(AH4 x, AH4 y, AH4 z) { return max(x, max(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Two-way maximum with the operands compared as signed 16-bit integers.
// The operands must go through the 16-bit signed types (ASW*), as AAbsSW*/AShrSW* do. Converting a 16-bit unsigned value
// to the 32-bit ASU* types keeps its value (0..65535), so the sign would be lost and the comparison would be unsigned.
AW1 AMaxSW1(AW1 a, AW1 b) { return AW1(max(ASW1(a), ASW1(b))); }
AW2 AMaxSW2(AW2 a, AW2 b) { return AW2(max(ASW2(a), ASW2(b))); }
AW3 AMaxSW3(AW3 a, AW3 b) { return AW3(max(ASW3(a), ASW3(b))); }
AW4 AMaxSW4(AW4 a, AW4 b) { return AW4(max(ASW4(a), ASW4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// No packed version of min3.
AH1 AMin3H1(AH1 x, AH1 y, AH1 z) { return min(x, min(y, z)); }
AH2 AMin3H2(AH2 x, AH2 y, AH2 z) { return min(x, min(y, z)); }
AH3 AMin3H3(AH3 x, AH3 y, AH3 z) { return min(x, min(y, z)); }
AH4 AMin3H4(AH4 x, AH4 y, AH4 z) { return min(x, min(y, z)); }
//------------------------------------------------------------------------------------------------------------------------------
// Two-way minimum with the operands compared as signed 16-bit integers.
// The operands must go through the 16-bit signed types (ASW*), as AAbsSW*/AShrSW* do. Converting a 16-bit unsigned value
// to the 32-bit ASU* types keeps its value (0..65535), so the sign would be lost and the comparison would be unsigned.
AW1 AMinSW1(AW1 a, AW1 b) { return AW1(min(ASW1(a), ASW1(b))); }
AW2 AMinSW2(AW2 a, AW2 b) { return AW2(min(ASW2(a), ASW2(b))); }
AW3 AMinSW3(AW3 a, AW3 b) { return AW3(min(ASW3(a), ASW3(b))); }
AW4 AMinSW4(AW4 a, AW4 b) { return AW4(min(ASW4(a), ASW4(b))); }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal, written as 1.0 / x.
AH1 ARcpH1(AH1 x) { return AH1_(1.0) / x; }
AH2 ARcpH2(AH2 x) { return AH2_(1.0) / x; }
AH3 ARcpH3(AH3 x) { return AH3_(1.0) / x; }
AH4 ARcpH4(AH4 x) { return AH4_(1.0) / x; }
//------------------------------------------------------------------------------------------------------------------------------
// Reciprocal square root, written as 1.0 / sqrt(x).
AH1 ARsqH1(AH1 x) { return AH1_(1.0) / sqrt(x); }
AH2 ARsqH2(AH2 x) { return AH2_(1.0) / sqrt(x); }
AH3 ARsqH3(AH3 x) { return AH3_(1.0) / sqrt(x); }
AH4 ARsqH4(AH4 x) { return AH4_(1.0) / sqrt(x); }
//------------------------------------------------------------------------------------------------------------------------------
// Saturate: clamp to the range 0.0 .. 1.0.
AH1 ASatH1(AH1 x) { return clamp(x, AH1_(0.0), AH1_(1.0)); }
AH2 ASatH2(AH2 x) { return clamp(x, AH2_(0.0), AH2_(1.0)); }
AH3 ASatH3(AH3 x) { return clamp(x, AH3_(0.0), AH3_(1.0)); }
AH4 ASatH4(AH4 x) { return clamp(x, AH4_(0.0), AH4_(1.0)); }
//------------------------------------------------------------------------------------------------------------------------------
  // Right shift carried out on the signed word types: both operands are converted to ASW*,
  // shifted, and the result is converted back to the unsigned word type.
  AW1 AShrSW1(AW1 a,AW1 b){
   ASW1 value=ASW1(a);
   ASW1 count=ASW1(b);
   return AW1(value>>count);}
  AW2 AShrSW2(AW2 a,AW2 b){
   ASW2 value=ASW2(a);
   ASW2 count=ASW2(b);
   return AW2(value>>count);}
  AW3 AShrSW3(AW3 a,AW3 b){
   ASW3 value=ASW3(a);
   ASW3 count=ASW3(b);
   return AW3(value>>count);}
  AW4 AShrSW4(AW4 a,AW4 b){
   ASW4 value=ASW4(a);
   ASW4 count=ASW4(b);
   return AW4(value>>count);}
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                         GLSL DOUBLE
//==============================================================================================================================
 #ifdef A_DUBL
  // 64-bit floating point scalar and vector type names.
  #define AD1 double
  #define AD2 dvec2
  #define AD3 dvec3
  #define AD4 dvec4
//------------------------------------------------------------------------------------------------------------------------------
  // Broadcast helpers: build a 1..4 component value with every component set to the scalar 'a'.
  AD1 AD1_x(AD1 a){return a;}
  AD2 AD2_x(AD1 a){return AD2(a);}
  AD3 AD3_x(AD1 a){return AD3(a);}
  AD4 AD4_x(AD1 a){return AD4(a);}
  // Convenience forms which first convert the argument to AD1.
  #define AD1_(a) AD1_x(AD1(a))
  #define AD2_(a) AD2_x(AD1(a))
  #define AD3_(a) AD3_x(AD1(a))
  #define AD4_(a) AD4_x(AD1(a))
//==============================================================================================================================
  // Fractional part, per component (GLSL 'fract').
  AD1 AFractD1(AD1 x){
   AD1 frac=fract(x);
   return frac;}
  AD2 AFractD2(AD2 x){
   AD2 frac=fract(x);
   return frac;}
  AD3 AFractD3(AD3 x){
   AD3 frac=fract(x);
   return frac;}
  AD4 AFractD4(AD4 x){
   AD4 frac=fract(x);
   return frac;}
//------------------------------------------------------------------------------------------------------------------------------
  // Linear blend between 'x' and 'y' weighted by 'a' (GLSL 'mix').
  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){
   AD1 blended=mix(x,y,a);
   return blended;}
  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){
   AD2 blended=mix(x,y,a);
   return blended;}
  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){
   AD3 blended=mix(x,y,a);
   return blended;}
  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){
   AD4 blended=mix(x,y,a);
   return blended;}
//------------------------------------------------------------------------------------------------------------------------------
  // Reciprocal: 1.0 divided by 'x', per component.
  AD1 ARcpD1(AD1 x){
   AD1 one=AD1_(1.0);
   return one/x;}
  AD2 ARcpD2(AD2 x){
   AD2 one=AD2_(1.0);
   return one/x;}
  AD3 ARcpD3(AD3 x){
   AD3 one=AD3_(1.0);
   return one/x;}
  AD4 ARcpD4(AD4 x){
   AD4 one=AD4_(1.0);
   return one/x;}
//------------------------------------------------------------------------------------------------------------------------------
  // Reciprocal square root: 1.0 divided by sqrt(x), per component.
  AD1 ARsqD1(AD1 x){
   AD1 root=sqrt(x);
   return AD1_(1.0)/root;}
  AD2 ARsqD2(AD2 x){
   AD2 root=sqrt(x);
   return AD2_(1.0)/root;}
  AD3 ARsqD3(AD3 x){
   AD3 root=sqrt(x);
   return AD3_(1.0)/root;}
  AD4 ARsqD4(AD4 x){
   AD4 root=sqrt(x);
   return AD4_(1.0)/root;}
//------------------------------------------------------------------------------------------------------------------------------
  // Saturate: clamp each component to the range {0.0 to 1.0}.
  AD1 ASatD1(AD1 x){
   AD1 lo=AD1_(0.0);
   AD1 hi=AD1_(1.0);
   return clamp(x,lo,hi);}
  AD2 ASatD2(AD2 x){
   AD2 lo=AD2_(0.0);
   AD2 hi=AD2_(1.0);
   return clamp(x,lo,hi);}
  AD3 ASatD3(AD3 x){
   AD3 lo=AD3_(0.0);
   AD3 hi=AD3_(1.0);
   return clamp(x,lo,hi);}
  AD4 ASatD4(AD4 x){
   AD4 lo=AD4_(0.0);
   AD4 hi=AD4_(1.0);
   return clamp(x,lo,hi);}
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                         GLSL LONG
//==============================================================================================================================
 #ifdef A_LONG
  // 64-bit unsigned integer scalar and vector type names.
  #define AL1 uint64_t
  #define AL2 u64vec2
  #define AL3 u64vec3
  #define AL4 u64vec4
//------------------------------------------------------------------------------------------------------------------------------
  // 64-bit signed integer scalar and vector type names.
  #define ASL1 int64_t
  #define ASL2 i64vec2
  #define ASL3 i64vec3
  #define ASL4 i64vec4
//------------------------------------------------------------------------------------------------------------------------------
  // Conversion between one 64-bit value and a pair of 32-bit values (packUint2x32 / unpackUint2x32).
  #define AL1_AU2(x) packUint2x32(AU2(x))
  #define AU2_AL1(x) unpackUint2x32(AL1(x))
//------------------------------------------------------------------------------------------------------------------------------
  // Broadcast helpers: build a 1..4 component value with every component set to the scalar 'a'.
  AL1 AL1_x(AL1 a){return AL1(a);}
  AL2 AL2_x(AL1 a){return AL2(a,a);}
  AL3 AL3_x(AL1 a){return AL3(a,a,a);}
  AL4 AL4_x(AL1 a){return AL4(a,a,a,a);}
  // Convenience forms which first convert the argument to AL1.
  #define AL1_(a) AL1_x(AL1(a))
  #define AL2_(a) AL2_x(AL1(a))
  #define AL3_(a) AL3_x(AL1(a))
  #define AL4_(a) AL4_x(AL1(a))
//==============================================================================================================================
  // Absolute value of the signed 64-bit interpretation, returned in the unsigned type.
  AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));}
  AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
  AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
  AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
//------------------------------------------------------------------------------------------------------------------------------
  // Signed 64-bit maximum on values carried in the unsigned 64-bit types.
  // The operands are converted to the signed 64-bit types (ASL*), matching AAbsSL*.
  // Converting through the 32-bit signed types (ASU*) would discard the upper 32 bits of each operand.
  AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));}
  AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));}
  AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
  AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
//------------------------------------------------------------------------------------------------------------------------------
  // Signed 64-bit minimum on values carried in the unsigned 64-bit types.
  // The operands are converted to the signed 64-bit types (ASL*), matching AAbsSL*.
  // Converting through the 32-bit signed types (ASU*) would discard the upper 32 bits of each operand.
  AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));}
  AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));}
  AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
  AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                      WAVE OPERATIONS
//==============================================================================================================================
 #ifdef A_WAVE
  // Wave-wide sum of 'v', forwarded to the GLSL 'subgroupAdd' built-in (overloaded for 1..4 components).
  AF1 AWaveAdd(AF1 v){return subgroupAdd(v);}
  AF2 AWaveAdd(AF2 v){return subgroupAdd(v);}
  AF3 AWaveAdd(AF3 v){return subgroupAdd(v);}
  AF4 AWaveAdd(AF4 v){return subgroupAdd(v);}
 #endif
//==============================================================================================================================
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//
//                                                            HLSL
//
//
//==============================================================================================================================
#if defined(A_HLSL) && defined(A_GPU)
 // Predicate (boolean) type names.
 #define AP1 bool
 #define AP2 bool2
 #define AP3 bool3
 #define AP4 bool4
//------------------------------------------------------------------------------------------------------------------------------
 // 32-bit float type names.
 #define AF1 float
 #define AF2 float2
 #define AF3 float3
 #define AF4 float4
//------------------------------------------------------------------------------------------------------------------------------
 // 32-bit unsigned integer type names.
 #define AU1 uint
 #define AU2 uint2
 #define AU3 uint3
 #define AU4 uint4
//------------------------------------------------------------------------------------------------------------------------------
 // 32-bit signed integer type names.
 #define ASU1 int
 #define ASU2 int2
 #define ASU3 int3
 #define ASU4 int4
//==============================================================================================================================
 // Bit-pattern reinterpretation: unsigned integer to float ('asfloat').
 #define AF1_AU1(x) asfloat(AU1(x))
 #define AF2_AU2(x) asfloat(AU2(x))
 #define AF3_AU3(x) asfloat(AU3(x))
 #define AF4_AU4(x) asfloat(AU4(x))
//------------------------------------------------------------------------------------------------------------------------------
 // Bit-pattern reinterpretation: float to unsigned integer ('asuint').
 #define AU1_AF1(x) asuint(AF1(x))
 #define AU2_AF2(x) asuint(AF2(x))
 #define AU3_AF3(x) asuint(AF3(x))
 #define AU4_AF4(x) asuint(AF4(x))
//------------------------------------------------------------------------------------------------------------------------------
 // Pack two floats as 16-bit halfs into one 32-bit value: 'a.x' in the low 16 bits, 'a.y' in the high 16 bits.
 AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);}
 #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a))
 // NOTE(review): D3DCOLORtoUBYTE4 appears to return a 4-component integer vector rather than one packed 32-bit
 // value - confirm this matches the packed AU1 result that the macro name implies.
 #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
//------------------------------------------------------------------------------------------------------------------------------
 // Unpack two 16-bit halfs from one 32-bit value: low 16 bits to '.x', high 16 bits to '.y'.
 AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));}
 #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
//==============================================================================================================================
 // Broadcast helpers: build a 1..4 component float value with every component set to the scalar 'a'.
 AF1 AF1_x(AF1 a){return AF1(a);}
 AF2 AF2_x(AF1 a){return AF2(a,a);}
 AF3 AF3_x(AF1 a){return AF3(a,a,a);}
 AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
 #define AF1_(a) AF1_x(AF1(a))
 #define AF2_(a) AF2_x(AF1(a))
 #define AF3_(a) AF3_x(AF1(a))
 #define AF4_(a) AF4_x(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 // Broadcast helpers: build a 1..4 component unsigned value with every component set to the scalar 'a'.
 AU1 AU1_x(AU1 a){return AU1(a);}
 AU2 AU2_x(AU1 a){return AU2(a,a);}
 AU3 AU3_x(AU1 a){return AU3(a,a,a);}
 AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
 #define AU1_(a) AU1_x(AU1(a))
 #define AU2_(a) AU2_x(AU1(a))
 #define AU3_(a) AU3_x(AU1(a))
 #define AU4_(a) AU4_x(AU1(a))
//==============================================================================================================================
 // Absolute value of the signed interpretation, returned in the unsigned type.
 AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
 AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
 AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
 AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
//------------------------------------------------------------------------------------------------------------------------------
 // Bit field extract: shift 'src' right by 'off' and keep the low 'bits' bits.
 // The mask is built from an unsigned literal so the shift and subtract stay in unsigned arithmetic.
 // NOTE(review): 'bits'==32 is not handled specially (the shift count reaches the type width) - callers presumably
 // pass bits<32, confirm.
 AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1u;return (src>>off)&mask;}
 // Bit field insert by mask: take 'ins' where 'mask' bits are set, 'src' elsewhere.
 AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
 // Bit field insert by count: replace the low 'bits' bits of 'src' with the low 'bits' bits of 'ins'.
 // Same unsigned mask construction (and the same 'bits'<32 assumption) as ABfe.
 AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1u;return (ins&mask)|(src&(~mask));}
//------------------------------------------------------------------------------------------------------------------------------
 // Fractional part computed as 'x-floor(x)', per component.
 AF1 AFractF1(AF1 x){return x-floor(x);}
 AF2 AFractF2(AF2 x){return x-floor(x);}
 AF3 AFractF3(AF3 x){return x-floor(x);}
 AF4 AFractF4(AF4 x){return x-floor(x);}
//------------------------------------------------------------------------------------------------------------------------------
 // Linear blend between 'x' and 'y' weighted by 'a', forwarded to the HLSL 'lerp' intrinsic.
 AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);}
 AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return lerp(x,y,a);}
 AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);}
 AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);}
//------------------------------------------------------------------------------------------------------------------------------
 // Largest of three float values, per component.
 AF1 AMax3F1(AF1 x,AF1 y,AF1 z){
  AF1 yz=max(y,z);
  return max(x,yz);}
 AF2 AMax3F2(AF2 x,AF2 y,AF2 z){
  AF2 yz=max(y,z);
  return max(x,yz);}
 AF3 AMax3F3(AF3 x,AF3 y,AF3 z){
  AF3 yz=max(y,z);
  return max(x,yz);}
 AF4 AMax3F4(AF4 x,AF4 y,AF4 z){
  AF4 yz=max(y,z);
  return max(x,yz);}
//------------------------------------------------------------------------------------------------------------------------------
 // Largest of three values compared as signed integers; inputs and result use the unsigned types.
 AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){
  ASU1 sx=ASU1(x);
  ASU1 syz=max(ASU1(y),ASU1(z));
  return AU1(max(sx,syz));}
 AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){
  ASU2 sx=ASU2(x);
  ASU2 syz=max(ASU2(y),ASU2(z));
  return AU2(max(sx,syz));}
 AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){
  ASU3 sx=ASU3(x);
  ASU3 syz=max(ASU3(y),ASU3(z));
  return AU3(max(sx,syz));}
 AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){
  ASU4 sx=ASU4(x);
  ASU4 syz=max(ASU4(y),ASU4(z));
  return AU4(max(sx,syz));}
//------------------------------------------------------------------------------------------------------------------------------
 // Largest of three unsigned values, per component.
 AU1 AMax3U1(AU1 x,AU1 y,AU1 z){
  AU1 yz=max(y,z);
  return max(x,yz);}
 AU2 AMax3U2(AU2 x,AU2 y,AU2 z){
  AU2 yz=max(y,z);
  return max(x,yz);}
 AU3 AMax3U3(AU3 x,AU3 y,AU3 z){
  AU3 yz=max(y,z);
  return max(x,yz);}
 AU4 AMax3U4(AU4 x,AU4 y,AU4 z){
  AU4 yz=max(y,z);
  return max(x,yz);}
//------------------------------------------------------------------------------------------------------------------------------
 // Larger of two values compared as signed integers; inputs and result use the unsigned types.
 AU1 AMaxSU1(AU1 a,AU1 b){
  ASU1 sa=ASU1(a);
  ASU1 sb=ASU1(b);
  return AU1(max(sa,sb));}
 AU2 AMaxSU2(AU2 a,AU2 b){
  ASU2 sa=ASU2(a);
  ASU2 sb=ASU2(b);
  return AU2(max(sa,sb));}
 AU3 AMaxSU3(AU3 a,AU3 b){
  ASU3 sa=ASU3(a);
  ASU3 sb=ASU3(b);
  return AU3(max(sa,sb));}
 AU4 AMaxSU4(AU4 a,AU4 b){
  ASU4 sa=ASU4(a);
  ASU4 sb=ASU4(b);
  return AU4(max(sa,sb));}
//------------------------------------------------------------------------------------------------------------------------------
 // Median of three: max(min(x,y),min(max(x,y),z)), per component.
 AF1 AMed3F1(AF1 x,AF1 y,AF1 z){
  AF1 lo=min(x,y);
  AF1 hi=max(x,y);
  return max(lo,min(hi,z));}
 AF2 AMed3F2(AF2 x,AF2 y,AF2 z){
  AF2 lo=min(x,y);
  AF2 hi=max(x,y);
  return max(lo,min(hi,z));}