1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "ShaderCore.hpp" 16 17 #include "Device/Renderer.hpp" 18 #include "Vulkan/VkDebug.hpp" 19 20 #include <limits.h> 21 22 namespace sw 23 { 24 extern TranscendentalPrecision logPrecision; 25 extern TranscendentalPrecision expPrecision; 26 extern TranscendentalPrecision rcpPrecision; 27 extern TranscendentalPrecision rsqPrecision; 28 Vector4s()29 Vector4s::Vector4s() 30 { 31 } 32 Vector4s(unsigned short x,unsigned short y,unsigned short z,unsigned short w)33 Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w) 34 { 35 this->x = Short4(x); 36 this->y = Short4(y); 37 this->z = Short4(z); 38 this->w = Short4(w); 39 } 40 Vector4s(const Vector4s & rhs)41 Vector4s::Vector4s(const Vector4s &rhs) 42 { 43 x = rhs.x; 44 y = rhs.y; 45 z = rhs.z; 46 w = rhs.w; 47 } 48 operator =(const Vector4s & rhs)49 Vector4s &Vector4s::operator=(const Vector4s &rhs) 50 { 51 x = rhs.x; 52 y = rhs.y; 53 z = rhs.z; 54 w = rhs.w; 55 56 return *this; 57 } 58 operator [](int i)59 Short4 &Vector4s::operator[](int i) 60 { 61 switch(i) 62 { 63 case 0: return x; 64 case 1: return y; 65 case 2: return z; 66 case 3: return w; 67 } 68 69 return x; 70 } 71 Vector4f()72 Vector4f::Vector4f() 73 { 74 } 75 Vector4f(float x,float y,float z,float w)76 Vector4f::Vector4f(float x, float y, float z, float w) 77 { 78 this->x = Float4(x); 79 this->y = Float4(y); 80 this->z = Float4(z); 81 this->w = Float4(w); 82 } 83 Vector4f(const Vector4f & rhs)84 Vector4f::Vector4f(const Vector4f &rhs) 85 { 86 x = rhs.x; 87 y = rhs.y; 88 z = rhs.z; 89 w = rhs.w; 90 } 91 operator =(const Vector4f & rhs)92 Vector4f &Vector4f::operator=(const Vector4f &rhs) 93 { 94 x = rhs.x; 95 y = rhs.y; 96 z = rhs.z; 97 w = rhs.w; 98 99 return *this; 100 } 101 operator [](int i)102 Float4 &Vector4f::operator[](int i) 103 { 104 switch(i) 105 { 106 case 0: return x; 107 case 1: return y; 108 case 2: return z; 109 case 3: return w; 110 } 111 112 return x; 113 } 114 exponential2(RValue<Float4> x,bool pp)115 Float4 exponential2(RValue<Float4> x, bool pp) 116 { 117 // This implementation is based on 2^(i + f) = 2^i * 2^f, 118 // where i is the integer part of x and f is the fraction. 119 120 // For 2^i we can put the integer part directly in the exponent of 121 // the IEEE-754 floating-point number. Clamp to prevent overflow 122 // past the representation of infinity. 123 Float4 x0 = x; 124 x0 = Min(x0, As<Float4>(Int4(0x43010000))); // 129.00000e+0f 125 x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF))); // -126.99999e+0f 126 127 Int4 i = RoundInt(x0 - Float4(0.5f)); 128 Float4 ii = As<Float4>((i + Int4(127)) << 23); // Add single-precision bias, and shift into exponent. 129 130 // For the fractional part use a polynomial 131 // which approximates 2^f in the 0 to 1 range. 132 Float4 f = x0 - Float4(i); 133 Float4 ff = As<Float4>(Int4(0x3AF61905)); // 1.8775767e-3f 134 ff = ff * f + As<Float4>(Int4(0x3C134806)); // 8.9893397e-3f 135 ff = ff * f + As<Float4>(Int4(0x3D64AA23)); // 5.5826318e-2f 136 ff = ff * f + As<Float4>(Int4(0x3E75EAD4)); // 2.4015361e-1f 137 ff = ff * f + As<Float4>(Int4(0x3F31727B)); // 6.9315308e-1f 138 ff = ff * f + Float4(1.0f); 139 140 return ii * ff; 141 } 142 logarithm2(RValue<Float4> x,bool absolute,bool pp)143 Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp) 144 { 145 Float4 x0; 146 Float4 x1; 147 Float4 x2; 148 Float4 x3; 149 150 x0 = x; 151 152 x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000)); 153 x1 = As<Float4>(As<UInt4>(x1) >> 8); 154 x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f))); 155 x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f); // FIXME: (x1 - 1.4960938f) * 256.0f; 156 x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f))); 157 158 x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f); 159 x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f); 160 x2 /= x3; 161 162 x1 += (x0 - Float4(1.0f)) * x2; 163 164 Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000)); 165 return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1))); 166 } 167 exponential(RValue<Float4> x,bool pp)168 Float4 exponential(RValue<Float4> x, bool pp) 169 { 170 // FIXME: Propagate the constant 171 return exponential2(Float4(1.44269504f) * x, pp); // 1/ln(2) 172 } 173 logarithm(RValue<Float4> x,bool absolute,bool pp)174 Float4 logarithm(RValue<Float4> x, bool absolute, bool pp) 175 { 176 // FIXME: Propagate the constant 177 return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp); // ln(2) 178 } 179 power(RValue<Float4> x,RValue<Float4> y,bool pp)180 Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp) 181 { 182 Float4 log = logarithm2(x, true, pp); 183 log *= y; 184 return exponential2(log, pp); 185 } 186 reciprocal(RValue<Float4> x,bool pp,bool finite,bool exactAtPow2)187 Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2) 188 { 189 Float4 rcp; 190 191 if(!pp && rcpPrecision >= WHQL) 192 { 193 rcp = Float4(1.0f) / x; 194 } 195 else 196 { 197 rcp = Rcp_pp(x, exactAtPow2); 198 199 if(!pp) 200 { 201 rcp = (rcp + rcp) - (x * rcp * rcp); 202 } 203 } 204 205 if(finite) 206 { 207 int big = 0x7F7FFFFF; 208 rcp = Min(rcp, Float4((float&)big)); 209 } 210 211 return rcp; 212 } 213 reciprocalSquareRoot(RValue<Float4> x,bool absolute,bool pp)214 Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp) 215 { 216 Float4 abs = x; 217 218 if(absolute) 219 { 220 abs = Abs(abs); 221 } 222 223 Float4 rsq; 224 225 if(!pp) 226 { 227 rsq = Float4(1.0f) / Sqrt(abs); 228 } 229 else 230 { 231 rsq = RcpSqrt_pp(abs); 232 233 if(!pp) 234 { 235 rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f); 236 } 237 238 rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq)); 239 } 240 241 return rsq; 242 } 243 modulo(RValue<Float4> x,RValue<Float4> y)244 Float4 modulo(RValue<Float4> x, RValue<Float4> y) 245 { 246 return x - y * Floor(x / y); 247 } 248 sine_pi(RValue<Float4> x,bool pp)249 Float4 sine_pi(RValue<Float4> x, bool pp) 250 { 251 const Float4 A = Float4(-4.05284734e-1f); // -4/pi^2 252 const Float4 B = Float4(1.27323954e+0f); // 4/pi 253 const Float4 C = Float4(7.75160950e-1f); 254 const Float4 D = Float4(2.24839049e-1f); 255 256 // Parabola approximating sine 257 Float4 sin = x * (Abs(x) * A + B); 258 259 // Improve precision from 0.06 to 0.001 260 if(true) 261 { 262 sin = sin * (Abs(sin) * D + C); 263 } 264 265 return sin; 266 } 267 cosine_pi(RValue<Float4> x,bool pp)268 Float4 cosine_pi(RValue<Float4> x, bool pp) 269 { 270 // cos(x) = sin(x + pi/2) 271 Float4 y = x + Float4(1.57079632e+0f); 272 273 // Wrap around 274 y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f))); 275 276 return sine_pi(y, pp); 277 } 278 sine(RValue<Float4> x,bool pp)279 Float4 sine(RValue<Float4> x, bool pp) 280 { 281 // Reduce to [-0.5, 0.5] range 282 Float4 y = x * Float4(1.59154943e-1f); // 1/2pi 283 y = y - Round(y); 284 285 if(!pp) 286 { 287 // From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs" 288 // This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations: 289 // !pp : 17 mul, 7 add, 1 sub, 1 reciprocal 290 // pp : 4 mul, 2 add, 2 abs 291 292 Float4 y2 = y * y; 293 Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f); 294 Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f)); 295 Float4 c2 = (c1 * c1) - (s1 * s1); 296 Float4 s2 = Float4(2.0f) * s1 * c1; 297 return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true); 298 } 299 300 const Float4 A = Float4(-16.0f); 301 const Float4 B = Float4(8.0f); 302 const Float4 C = Float4(7.75160950e-1f); 303 const Float4 D = Float4(2.24839049e-1f); 304 305 // Parabola approximating sine 306 Float4 sin = y * (Abs(y) * A + B); 307 308 // Improve precision from 0.06 to 0.001 309 if(true) 310 { 311 sin = sin * (Abs(sin) * D + C); 312 } 313 314 return sin; 315 } 316 cosine(RValue<Float4> x,bool pp)317 Float4 cosine(RValue<Float4> x, bool pp) 318 { 319 // cos(x) = sin(x + pi/2) 320 Float4 y = x + Float4(1.57079632e+0f); 321 return sine(y, pp); 322 } 323 tangent(RValue<Float4> x,bool pp)324 Float4 tangent(RValue<Float4> x, bool pp) 325 { 326 return sine(x, pp) / cosine(x, pp); 327 } 328 arccos(RValue<Float4> x,bool pp)329 Float4 arccos(RValue<Float4> x, bool pp) 330 { 331 // pi/2 - arcsin(x) 332 return Float4(1.57079632e+0f) - arcsin(x); 333 } 334 arcsin(RValue<Float4> x,bool pp)335 Float4 arcsin(RValue<Float4> x, bool pp) 336 { 337 if(false) // Simpler implementation fails even lowp precision tests 338 { 339 // x*(pi/2-sqrt(1-x*x)*pi/5) 340 return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f)); 341 } 342 else 343 { 344 // From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun 345 const Float4 half_pi(1.57079632f); 346 const Float4 a0(1.5707288f); 347 const Float4 a1(-0.2121144f); 348 const Float4 a2(0.0742610f); 349 const Float4 a3(-0.0187293f); 350 Float4 absx = Abs(x); 351 return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^ 352 (As<Int4>(x) & Int4(0x80000000))); 353 } 354 } 355 356 // Approximation of atan in [0..1] arctan_01(Float4 x,bool pp)357 Float4 arctan_01(Float4 x, bool pp) 358 { 359 if(pp) 360 { 361 return x * (Float4(-0.27f) * x + Float4(1.05539816f)); 362 } 363 else 364 { 365 // From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun 366 const Float4 a2(-0.3333314528f); 367 const Float4 a4(0.1999355085f); 368 const Float4 a6(-0.1420889944f); 369 const Float4 a8(0.1065626393f); 370 const Float4 a10(-0.0752896400f); 371 const Float4 a12(0.0429096138f); 372 const Float4 a14(-0.0161657367f); 373 const Float4 a16(0.0028662257f); 374 Float4 x2 = x * x; 375 return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16))))))))); 376 } 377 } 378 arctan(RValue<Float4> x,bool pp)379 Float4 arctan(RValue<Float4> x, bool pp) 380 { 381 Float4 absx = Abs(x); 382 Int4 O = CmpNLT(absx, Float4(1.0f)); 383 Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select 384 385 const Float4 half_pi(1.57079632f); 386 Float4 theta = arctan_01(y, pp); 387 return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select 388 (As<Int4>(x) & Int4(0x80000000))); 389 } 390 arctan(RValue<Float4> y,RValue<Float4> x,bool pp)391 Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp) 392 { 393 const Float4 pi(3.14159265f); // pi 394 const Float4 minus_pi(-3.14159265f); // -pi 395 const Float4 half_pi(1.57079632f); // pi/2 396 const Float4 quarter_pi(7.85398163e-1f); // pi/4 397 398 // Rotate to upper semicircle when in lower semicircle 399 Int4 S = CmpLT(y, Float4(0.0f)); 400 Float4 theta = As<Float4>(S & As<Int4>(minus_pi)); 401 Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x)); 402 Float4 y0 = Abs(y); 403 404 // Rotate to right quadrant when in left quadrant 405 Int4 Q = CmpLT(x0, Float4(0.0f)); 406 theta += As<Float4>(Q & As<Int4>(half_pi)); 407 Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0))); // FIXME: Vector select 408 Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select 409 410 // Mirror to first octant when in second octant 411 Int4 O = CmpNLT(y1, x1); 412 Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select 413 Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select 414 415 // Approximation of atan in [0..1] 416 Int4 zero_x = CmpEQ(x2, Float4(0.0f)); 417 Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4 418 Float4 atan2_theta = arctan_01(y2 / x2, pp); 419 theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select 420 (inf_y & As<Int4>(quarter_pi))); 421 422 // Recover loss of precision for tiny theta angles 423 Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta 424 return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select 425 } 426 sineh(RValue<Float4> x,bool pp)427 Float4 sineh(RValue<Float4> x, bool pp) 428 { 429 return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f); 430 } 431 cosineh(RValue<Float4> x,bool pp)432 Float4 cosineh(RValue<Float4> x, bool pp) 433 { 434 return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f); 435 } 436 tangenth(RValue<Float4> x,bool pp)437 Float4 tangenth(RValue<Float4> x, bool pp) 438 { 439 Float4 e_x = exponential(x, pp); 440 Float4 e_minus_x = exponential(-x, pp); 441 return (e_x - e_minus_x) / (e_x + e_minus_x); 442 } 443 arccosh(RValue<Float4> x,bool pp)444 Float4 arccosh(RValue<Float4> x, bool pp) 445 { 446 return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp); 447 } 448 arcsinh(RValue<Float4> x,bool pp)449 Float4 arcsinh(RValue<Float4> x, bool pp) 450 { 451 return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp); 452 } 453 arctanh(RValue<Float4> x,bool pp)454 Float4 arctanh(RValue<Float4> x, bool pp) 455 { 456 return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f); 457 } 458 dot2(const Vector4f & v0,const Vector4f & v1)459 Float4 dot2(const Vector4f &v0, const Vector4f &v1) 460 { 461 return v0.x * v1.x + v0.y * v1.y; 462 } 463 dot3(const Vector4f & v0,const Vector4f & v1)464 Float4 dot3(const Vector4f &v0, const Vector4f &v1) 465 { 466 return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; 467 } 468 dot4(const Vector4f & v0,const Vector4f & v1)469 Float4 dot4(const Vector4f &v0, const Vector4f &v1) 470 { 471 return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w; 472 } 473 transpose4x4(Short4 & row0,Short4 & row1,Short4 & row2,Short4 & row3)474 void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3) 475 { 476 Int2 tmp0 = UnpackHigh(row0, row1); 477 Int2 tmp1 = UnpackHigh(row2, row3); 478 Int2 tmp2 = UnpackLow(row0, row1); 479 Int2 tmp3 = UnpackLow(row2, row3); 480 481 row0 = UnpackLow(tmp2, tmp3); 482 row1 = UnpackHigh(tmp2, tmp3); 483 row2 = UnpackLow(tmp0, tmp1); 484 row3 = UnpackHigh(tmp0, tmp1); 485 } 486 transpose4x3(Short4 & row0,Short4 & row1,Short4 & row2,Short4 & row3)487 void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3) 488 { 489 Int2 tmp0 = UnpackHigh(row0, row1); 490 Int2 tmp1 = UnpackHigh(row2, row3); 491 Int2 tmp2 = UnpackLow(row0, row1); 492 Int2 tmp3 = UnpackLow(row2, row3); 493 494 row0 = UnpackLow(tmp2, tmp3); 495 row1 = UnpackHigh(tmp2, tmp3); 496 row2 = UnpackLow(tmp0, tmp1); 497 } 498 transpose4x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)499 void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 500 { 501 Float4 tmp0 = UnpackLow(row0, row1); 502 Float4 tmp1 = UnpackLow(row2, row3); 503 Float4 tmp2 = UnpackHigh(row0, row1); 504 Float4 tmp3 = UnpackHigh(row2, row3); 505 506 row0 = Float4(tmp0.xy, tmp1.xy); 507 row1 = Float4(tmp0.zw, tmp1.zw); 508 row2 = Float4(tmp2.xy, tmp3.xy); 509 row3 = Float4(tmp2.zw, tmp3.zw); 510 } 511 transpose4x3(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)512 void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 513 { 514 Float4 tmp0 = UnpackLow(row0, row1); 515 Float4 tmp1 = UnpackLow(row2, row3); 516 Float4 tmp2 = UnpackHigh(row0, row1); 517 Float4 tmp3 = UnpackHigh(row2, row3); 518 519 row0 = Float4(tmp0.xy, tmp1.xy); 520 row1 = Float4(tmp0.zw, tmp1.zw); 521 row2 = Float4(tmp2.xy, tmp3.xy); 522 } 523 transpose4x2(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)524 void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 525 { 526 Float4 tmp0 = UnpackLow(row0, row1); 527 Float4 tmp1 = UnpackLow(row2, row3); 528 529 row0 = Float4(tmp0.xy, tmp1.xy); 530 row1 = Float4(tmp0.zw, tmp1.zw); 531 } 532 transpose4x1(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)533 void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 534 { 535 Float4 tmp0 = UnpackLow(row0, row1); 536 Float4 tmp1 = UnpackLow(row2, row3); 537 538 row0 = Float4(tmp0.xy, tmp1.xy); 539 } 540 transpose2x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)541 void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 542 { 543 Float4 tmp01 = UnpackLow(row0, row1); 544 Float4 tmp23 = UnpackHigh(row0, row1); 545 546 row0 = tmp01; 547 row1 = Float4(tmp01.zw, row1.zw); 548 row2 = tmp23; 549 row3 = Float4(tmp23.zw, row3.zw); 550 } 551 transpose4xN(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3,int N)552 void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N) 553 { 554 switch(N) 555 { 556 case 1: transpose4x1(row0, row1, row2, row3); break; 557 case 2: transpose4x2(row0, row1, row2, row3); break; 558 case 3: transpose4x3(row0, row1, row2, row3); break; 559 case 4: transpose4x4(row0, row1, row2, row3); break; 560 } 561 } 562 operator [](RValue<Int4> index)563 const Vector4f RegisterFile::operator[](RValue<Int4> index) 564 { 565 ASSERT(indirectAddressable); 566 567 Int index0 = Extract(index, 0); 568 Int index1 = Extract(index, 1); 569 Int index2 = Extract(index, 2); 570 Int index3 = Extract(index, 3); 571 572 Vector4f r; 573 574 r.x.x = Extract(x[0][index0], 0); 575 r.x.y = Extract(x[0][index1], 1); 576 r.x.z = Extract(x[0][index2], 2); 577 r.x.w = Extract(x[0][index3], 3); 578 579 r.y.x = Extract(y[0][index0], 0); 580 r.y.y = Extract(y[0][index1], 1); 581 r.y.z = Extract(y[0][index2], 2); 582 r.y.w = Extract(y[0][index3], 3); 583 584 r.z.x = Extract(z[0][index0], 0); 585 r.z.y = Extract(z[0][index1], 1); 586 r.z.z = Extract(z[0][index2], 2); 587 r.z.w = Extract(z[0][index3], 3); 588 589 r.w.x = Extract(w[0][index0], 0); 590 r.w.y = Extract(w[0][index1], 1); 591 r.w.z = Extract(w[0][index2], 2); 592 r.w.w = Extract(w[0][index3], 3); 593 594 return r; 595 } 596 scatter_x(Int4 index,RValue<Float4> r)597 void RegisterFile::scatter_x(Int4 index, RValue<Float4> r) 598 { 599 ASSERT(indirectAddressable); 600 601 Int index0 = Extract(index, 0); 602 Int index1 = Extract(index, 1); 603 Int index2 = Extract(index, 2); 604 Int index3 = Extract(index, 3); 605 606 x[0][index0] = Insert(x[0][index0], Extract(r, 0), 0); 607 x[0][index1] = Insert(x[0][index1], Extract(r, 1), 1); 608 x[0][index2] = Insert(x[0][index2], Extract(r, 2), 2); 609 x[0][index3] = Insert(x[0][index3], Extract(r, 3), 3); 610 } 611 scatter_y(Int4 index,RValue<Float4> r)612 void RegisterFile::scatter_y(Int4 index, RValue<Float4> r) 613 { 614 ASSERT(indirectAddressable); 615 616 Int index0 = Extract(index, 0); 617 Int index1 = Extract(index, 1); 618 Int index2 = Extract(index, 2); 619 Int index3 = Extract(index, 3); 620 621 y[0][index0] = Insert(y[0][index0], Extract(r, 0), 0); 622 y[0][index1] = Insert(y[0][index1], Extract(r, 1), 1); 623 y[0][index2] = Insert(y[0][index2], Extract(r, 2), 2); 624 y[0][index3] = Insert(y[0][index3], Extract(r, 3), 3); 625 } 626 scatter_z(Int4 index,RValue<Float4> r)627 void RegisterFile::scatter_z(Int4 index, RValue<Float4> r) 628 { 629 ASSERT(indirectAddressable); 630 631 Int index0 = Extract(index, 0); 632 Int index1 = Extract(index, 1); 633 Int index2 = Extract(index, 2); 634 Int index3 = Extract(index, 3); 635 636 z[0][index0] = Insert(z[0][index0], Extract(r, 0), 0); 637 z[0][index1] = Insert(z[0][index1], Extract(r, 1), 1); 638 z[0][index2] = Insert(z[0][index2], Extract(r, 2), 2); 639 z[0][index3] = Insert(z[0][index3], Extract(r, 3), 3); 640 } 641 scatter_w(Int4 index,RValue<Float4> r)642 void RegisterFile::scatter_w(Int4 index, RValue<Float4> r) 643 { 644 ASSERT(indirectAddressable); 645 646 Int index0 = Extract(index, 0); 647 Int index1 = Extract(index, 1); 648 Int index2 = Extract(index, 2); 649 Int index3 = Extract(index, 3); 650 651 w[0][index0] = Insert(w[0][index0], Extract(r, 0), 0); 652 w[0][index1] = Insert(w[0][index1], Extract(r, 1), 1); 653 w[0][index2] = Insert(w[0][index2], Extract(r, 2), 2); 654 w[0][index3] = Insert(w[0][index3], Extract(r, 3), 3); 655 } 656 mov(Vector4f & dst,const Vector4f & src,bool integerDestination)657 void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination) 658 { 659 if(integerDestination) 660 { 661 dst.x = As<Float4>(RoundInt(src.x)); 662 dst.y = As<Float4>(RoundInt(src.y)); 663 dst.z = As<Float4>(RoundInt(src.z)); 664 dst.w = As<Float4>(RoundInt(src.w)); 665 } 666 else 667 { 668 dst = src; 669 } 670 } 671 neg(Vector4f & dst,const Vector4f & src)672 void ShaderCore::neg(Vector4f &dst, const Vector4f &src) 673 { 674 dst.x = -src.x; 675 dst.y = -src.y; 676 dst.z = -src.z; 677 dst.w = -src.w; 678 } 679 ineg(Vector4f & dst,const Vector4f & src)680 void ShaderCore::ineg(Vector4f &dst, const Vector4f &src) 681 { 682 dst.x = As<Float4>(-As<Int4>(src.x)); 683 dst.y = As<Float4>(-As<Int4>(src.y)); 684 dst.z = As<Float4>(-As<Int4>(src.z)); 685 dst.w = As<Float4>(-As<Int4>(src.w)); 686 } 687 f2b(Vector4f & dst,const Vector4f & src)688 void ShaderCore::f2b(Vector4f &dst, const Vector4f &src) 689 { 690 dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f))); 691 dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f))); 692 dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f))); 693 dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f))); 694 } 695 b2f(Vector4f & dst,const Vector4f & src)696 void ShaderCore::b2f(Vector4f &dst, const Vector4f &src) 697 { 698 dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f))); 699 dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f))); 700 dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f))); 701 dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f))); 702 } 703 f2i(Vector4f & dst,const Vector4f & src)704 void ShaderCore::f2i(Vector4f &dst, const Vector4f &src) 705 { 706 dst.x = As<Float4>(Int4(src.x)); 707 dst.y = As<Float4>(Int4(src.y)); 708 dst.z = As<Float4>(Int4(src.z)); 709 dst.w = As<Float4>(Int4(src.w)); 710 } 711 i2f(Vector4f & dst,const Vector4f & src)712 void ShaderCore::i2f(Vector4f &dst, const Vector4f &src) 713 { 714 dst.x = Float4(As<Int4>(src.x)); 715 dst.y = Float4(As<Int4>(src.y)); 716 dst.z = Float4(As<Int4>(src.z)); 717 dst.w = Float4(As<Int4>(src.w)); 718 } 719 f2u(Vector4f & dst,const Vector4f & src)720 void ShaderCore::f2u(Vector4f &dst, const Vector4f &src) 721 { 722 dst.x = As<Float4>(UInt4(src.x)); 723 dst.y = As<Float4>(UInt4(src.y)); 724 dst.z = As<Float4>(UInt4(src.z)); 725 dst.w = As<Float4>(UInt4(src.w)); 726 } 727 u2f(Vector4f & dst,const Vector4f & src)728 void ShaderCore::u2f(Vector4f &dst, const Vector4f &src) 729 { 730 dst.x = Float4(As<UInt4>(src.x)); 731 dst.y = Float4(As<UInt4>(src.y)); 732 dst.z = Float4(As<UInt4>(src.z)); 733 dst.w = Float4(As<UInt4>(src.w)); 734 } 735 i2b(Vector4f & dst,const Vector4f & src)736 void ShaderCore::i2b(Vector4f &dst, const Vector4f &src) 737 { 738 dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0))); 739 dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0))); 740 dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0))); 741 dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0))); 742 } 743 b2i(Vector4f & dst,const Vector4f & src)744 void ShaderCore::b2i(Vector4f &dst, const Vector4f &src) 745 { 746 dst.x = As<Float4>(As<Int4>(src.x) & Int4(1)); 747 dst.y = As<Float4>(As<Int4>(src.y) & Int4(1)); 748 dst.z = As<Float4>(As<Int4>(src.z) & Int4(1)); 749 dst.w = As<Float4>(As<Int4>(src.w) & Int4(1)); 750 } 751 add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)752 void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 753 { 754 dst.x = src0.x + src1.x; 755 dst.y = src0.y + src1.y; 756 dst.z = src0.z + src1.z; 757 dst.w = src0.w + src1.w; 758 } 759 iadd(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)760 void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 761 { 762 dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x)); 763 dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y)); 764 dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z)); 765 dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w)); 766 } 767 sub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)768 void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 769 { 770 dst.x = src0.x - src1.x; 771 dst.y = src0.y - src1.y; 772 dst.z = src0.z - src1.z; 773 dst.w = src0.w - src1.w; 774 } 775 isub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)776 void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 777 { 778 dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x)); 779 dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y)); 780 dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z)); 781 dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w)); 782 } 783 mad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)784 void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 785 { 786 dst.x = src0.x * src1.x + src2.x; 787 dst.y = src0.y * src1.y + src2.y; 788 dst.z = src0.z * src1.z + src2.z; 789 dst.w = src0.w * src1.w + src2.w; 790 } 791 imad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)792 void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 793 { 794 dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x)); 795 dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y)); 796 dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z)); 797 dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w)); 798 } 799 mul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)800 void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 801 { 802 dst.x = src0.x * src1.x; 803 dst.y = src0.y * src1.y; 804 dst.z = src0.z * src1.z; 805 dst.w = src0.w * src1.w; 806 } 807 imul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)808 void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 809 { 810 dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x)); 811 dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y)); 812 dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z)); 813 dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w)); 814 } 815 rcpx(Vector4f & dst,const Vector4f & src,bool pp)816 void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp) 817 { 818 Float4 rcp = reciprocal(src.x, pp, true, true); 819 820 dst.x = rcp; 821 dst.y = rcp; 822 dst.z = rcp; 823 dst.w = rcp; 824 } 825 div(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)826 void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 827 { 828 dst.x = src0.x / src1.x; 829 dst.y = src0.y / src1.y; 830 dst.z = src0.z / src1.z; 831 dst.w = src0.w / src1.w; 832 } 833 idiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)834 void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 835 { 836 Float4 intMax(As<Float4>(Int4(INT_MAX))); 837 cmp0i(dst.x, src1.x, intMax, src1.x); 838 dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x)); 839 cmp0i(dst.y, src1.y, intMax, src1.y); 840 dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y)); 841 cmp0i(dst.z, src1.z, intMax, src1.z); 842 dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z)); 843 cmp0i(dst.w, src1.w, intMax, src1.w); 844 dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w)); 845 } 846 udiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)847 void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 848 { 849 Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); 850 cmp0i(dst.x, src1.x, uintMax, src1.x); 851 dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x)); 852 cmp0i(dst.y, src1.y, uintMax, src1.y); 853 dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y)); 854 cmp0i(dst.z, src1.z, uintMax, src1.z); 855 dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z)); 856 cmp0i(dst.w, src1.w, uintMax, src1.w); 857 dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w)); 858 } 859 mod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)860 void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 861 { 862 dst.x = modulo(src0.x, src1.x); 863 dst.y = modulo(src0.y, src1.y); 864 dst.z = modulo(src0.z, src1.z); 865 dst.w = modulo(src0.w, src1.w); 866 } 867 imod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)868 void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 869 { 870 Float4 intMax(As<Float4>(Int4(INT_MAX))); 871 cmp0i(dst.x, src1.x, intMax, src1.x); 872 dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x)); 873 cmp0i(dst.y, src1.y, intMax, src1.y); 874 dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y)); 875 cmp0i(dst.z, src1.z, intMax, src1.z); 876 dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z)); 877 cmp0i(dst.w, src1.w, intMax, src1.w); 878 dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w)); 879 } 880 umod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)881 void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 882 { 883 Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); 884 cmp0i(dst.x, src1.x, uintMax, src1.x); 885 dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x)); 886 cmp0i(dst.y, src1.y, uintMax, src1.y); 887 dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y)); 888 cmp0i(dst.z, src1.z, uintMax, src1.z); 889 dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z)); 890 cmp0i(dst.w, src1.w, uintMax, src1.w); 891 dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w)); 892 } 893 shl(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)894 void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 895 { 896 dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x)); 897 dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y)); 898 dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z)); 899 dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w)); 900 } 901 ishr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)902 void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 903 { 904 dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x)); 905 dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y)); 906 dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z)); 907 dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w)); 908 } 909 ushr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)910 void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 911 { 912 dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x)); 913 dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y)); 914 dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z)); 915 dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w)); 916 } 917 rsqx(Vector4f & dst,const Vector4f & src,bool pp)918 void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp) 919 { 920 Float4 rsq = reciprocalSquareRoot(src.x, true, pp); 921 922 dst.x = rsq; 923 dst.y = rsq; 924 dst.z = rsq; 925 dst.w = rsq; 926 } 927 sqrt(Vector4f & dst,const Vector4f & src,bool pp)928 void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp) 929 { 930 dst.x = Sqrt(src.x); 931 dst.y = Sqrt(src.y); 932 dst.z = Sqrt(src.z); 933 dst.w = Sqrt(src.w); 934 } 935 rsq(Vector4f & dst,const Vector4f & src,bool pp)936 void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp) 937 { 938 dst.x = reciprocalSquareRoot(src.x, false, pp); 939 dst.y = reciprocalSquareRoot(src.y, false, pp); 940 dst.z = reciprocalSquareRoot(src.z, false, pp); 941 dst.w = reciprocalSquareRoot(src.w, false, pp); 942 } 943 len2(Float4 & dst,const Vector4f & src,bool pp)944 void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp) 945 { 946 dst = Sqrt(dot2(src, src)); 947 } 948 len3(Float4 & dst,const Vector4f & src,bool pp)949 void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp) 950 { 951 dst = Sqrt(dot3(src, src)); 952 } 953 len4(Float4 & dst,const Vector4f & src,bool pp)954 void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp) 955 { 956 dst = Sqrt(dot4(src, src)); 957 } 958 dist1(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)959 void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 960 { 961 dst = Abs(src0.x - src1.x); 962 } 963 dist2(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)964 void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 965 { 966 Float4 dx = src0.x - src1.x; 967 Float4 dy = src0.y - src1.y; 968 Float4 dot2 = dx * dx + dy * dy; 969 dst = Sqrt(dot2); 970 } 971 dist3(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)972 void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 973 { 974 Float4 dx = src0.x - src1.x; 975 Float4 dy = src0.y - src1.y; 976 Float4 dz = src0.z - src1.z; 977 Float4 dot3 = dx * dx + dy * dy + dz * dz; 978 dst = Sqrt(dot3); 979 } 980 dist4(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)981 void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 982 { 983 Float4 dx = src0.x - src1.x; 984 Float4 dy = src0.y - src1.y; 985 Float4 dz = src0.z - src1.z; 986 Float4 dw = src0.w - src1.w; 987 Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw; 988 dst = Sqrt(dot4); 989 } 990 dp1(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)991 void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 992 { 993 Float4 t = src0.x * src1.x; 994 995 dst.x = t; 996 dst.y = t; 997 dst.z = t; 998 dst.w = t; 999 } 1000 dp2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1001 void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1002 { 1003 Float4 t = dot2(src0, src1); 1004 1005 dst.x = t; 1006 dst.y = t; 1007 dst.z = t; 1008 dst.w = t; 1009 } 1010 dp2add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1011 void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1012 { 1013 Float4 t = dot2(src0, src1) + src2.x; 1014 1015 dst.x = t; 1016 dst.y = t; 1017 dst.z = t; 1018 dst.w = t; 1019 } 1020 dp3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1021 void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1022 { 1023 Float4 dot = dot3(src0, src1); 1024 1025 dst.x = dot; 1026 dst.y = dot; 1027 dst.z = dot; 1028 dst.w = dot; 1029 } 1030 dp4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1031 void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1032 { 1033 Float4 dot = dot4(src0, src1); 1034 1035 dst.x = dot; 1036 dst.y = dot; 1037 dst.z = dot; 1038 dst.w = dot; 1039 } 1040 min(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1041 void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1042 { 1043 dst.x = Min(src0.x, src1.x); 1044 dst.y = Min(src0.y, src1.y); 1045 dst.z = Min(src0.z, src1.z); 1046 dst.w = Min(src0.w, src1.w); 1047 } 1048 imin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1049 void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1050 { 1051 dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x))); 1052 dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y))); 1053 dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z))); 1054 dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w))); 1055 } 1056 umin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1057 void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1058 { 1059 dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1060 dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1061 dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1062 dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1063 } 1064 max(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1065 void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1066 { 1067 dst.x = Max(src0.x, src1.x); 1068 dst.y = Max(src0.y, src1.y); 1069 dst.z = Max(src0.z, src1.z); 1070 dst.w = Max(src0.w, src1.w); 1071 } 1072 imax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1073 void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1074 { 1075 dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x))); 1076 dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y))); 1077 dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z))); 1078 dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w))); 1079 } 1080 umax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1081 void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1082 { 1083 dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x))); 1084 dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y))); 1085 dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z))); 1086 dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w))); 1087 } 1088 slt(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1089 void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1090 { 1091 dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f))); 1092 dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f))); 1093 dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f))); 1094 dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f))); 1095 } 1096 step(Vector4f & dst,const Vector4f & edge,const Vector4f & x)1097 void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x) 1098 { 1099 dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f))); 1100 dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f))); 1101 dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f))); 1102 dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f))); 1103 } 1104 exp2x(Vector4f & dst,const Vector4f & src,bool pp)1105 void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp) 1106 { 1107 Float4 exp = exponential2(src.x, pp); 1108 1109 dst.x = exp; 1110 dst.y = exp; 1111 dst.z = exp; 1112 dst.w = exp; 1113 } 1114 exp2(Vector4f & dst,const Vector4f & src,bool pp)1115 void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp) 1116 { 1117 dst.x = exponential2(src.x, pp); 1118 dst.y = exponential2(src.y, pp); 1119 dst.z = exponential2(src.z, pp); 1120 dst.w = exponential2(src.w, pp); 1121 } 1122 exp(Vector4f & dst,const Vector4f & src,bool pp)1123 void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp) 1124 { 1125 dst.x = exponential(src.x, pp); 1126 dst.y = exponential(src.y, pp); 1127 dst.z = exponential(src.z, pp); 1128 dst.w = exponential(src.w, pp); 1129 } 1130 log2x(Vector4f & dst,const Vector4f & src,bool pp)1131 void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp) 1132 { 1133 Float4 log = logarithm2(src.x, true, pp); 1134 1135 dst.x = log; 1136 dst.y = log; 1137 dst.z = log; 1138 dst.w = log; 1139 } 1140 log2(Vector4f & dst,const Vector4f & src,bool pp)1141 void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp) 1142 { 1143 dst.x = logarithm2(src.x, false, pp); 1144 dst.y = logarithm2(src.y, false, pp); 1145 dst.z = logarithm2(src.z, false, pp); 1146 dst.w = logarithm2(src.w, false, pp); 1147 } 1148 log(Vector4f & dst,const Vector4f & src,bool pp)1149 void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp) 1150 { 1151 dst.x = logarithm(src.x, false, pp); 1152 dst.y = logarithm(src.y, false, pp); 1153 dst.z = logarithm(src.z, false, pp); 1154 dst.w = logarithm(src.w, false, pp); 1155 } 1156 lit(Vector4f & dst,const Vector4f & src)1157 void ShaderCore::lit(Vector4f &dst, const Vector4f &src) 1158 { 1159 dst.x = Float4(1.0f); 1160 dst.y = Max(src.x, Float4(0.0f)); 1161 1162 Float4 pow; 1163 1164 pow = src.w; 1165 pow = Min(pow, Float4(127.9961f)); 1166 pow = Max(pow, Float4(-127.9961f)); 1167 1168 dst.z = power(src.y, pow); 1169 dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f))); 1170 dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f))); 1171 1172 dst.w = Float4(1.0f); 1173 } 1174 att(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1175 void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1176 { 1177 // Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d 1178 dst.x = 1; 1179 dst.y = src0.y * src1.y; 1180 dst.z = src0.z; 1181 dst.w = src1.w; 1182 } 1183 lrp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1184 void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1185 { 1186 dst.x = src0.x * (src1.x - src2.x) + src2.x; 1187 dst.y = src0.y * (src1.y - src2.y) + src2.y; 1188 dst.z = src0.z * (src1.z - src2.z) + src2.z; 1189 dst.w = src0.w * (src1.w - src2.w) + src2.w; 1190 } 1191 isinf(Vector4f & dst,const Vector4f & src)1192 void ShaderCore::isinf(Vector4f &dst, const Vector4f &src) 1193 { 1194 dst.x = As<Float4>(IsInf(src.x)); 1195 dst.y = As<Float4>(IsInf(src.y)); 1196 dst.z = As<Float4>(IsInf(src.z)); 1197 dst.w = As<Float4>(IsInf(src.w)); 1198 } 1199 isnan(Vector4f & dst,const Vector4f & src)1200 void ShaderCore::isnan(Vector4f &dst, const Vector4f &src) 1201 { 1202 dst.x = As<Float4>(IsNan(src.x)); 1203 dst.y = As<Float4>(IsNan(src.y)); 1204 dst.z = As<Float4>(IsNan(src.z)); 1205 dst.w = As<Float4>(IsNan(src.w)); 1206 } 1207 smooth(Vector4f & dst,const Vector4f & edge0,const Vector4f & edge1,const Vector4f & x)1208 void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x) 1209 { 1210 Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx); 1211 Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty); 1212 Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz); 1213 Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw); 1214 } 1215 floatToHalfBits(Float4 & dst,const Float4 & floatBits,bool storeInUpperBits)1216 void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits) 1217 { 1218 static const uint32_t mask_sign = 0x80000000u; 1219 static const uint32_t mask_round = ~0xfffu; 1220 static const uint32_t c_f32infty = 255 << 23; 1221 static const uint32_t c_magic = 15 << 23; 1222 static const uint32_t c_nanbit = 0x200; 1223 static const uint32_t c_infty_as_fp16 = 0x7c00; 1224 static const uint32_t c_clamp = (31 << 23) - 0x1000; 1225 1226 UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits); 1227 UInt4 absf = As<UInt4>(floatBits) ^ justsign; 1228 UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf); 1229 1230 // Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf 1231 // instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation) 1232 UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)), 1233 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) | 1234 ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) | 1235 UInt4(c_infty_as_fp16))); 1236 1237 dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16)); 1238 } 1239 halfToFloatBits(Float4 & dst,const Float4 & halfBits)1240 void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits) 1241 { 1242 static const uint32_t mask_nosign = 0x7FFF; 1243 static const uint32_t magic = (254 - 15) << 23; 1244 static const uint32_t was_infnan = 0x7BFF; 1245 static const uint32_t exp_infnan = 255 << 23; 1246 1247 UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign); 1248 dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) | 1249 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) | 1250 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan))); 1251 } 1252 packHalf2x16(Vector4f & d,const Vector4f & s0)1253 void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0) 1254 { 1255 // half2 | half1 1256 floatToHalfBits(d.x, s0.x, false); 1257 floatToHalfBits(d.x, s0.y, true); 1258 } 1259 unpackHalf2x16(Vector4f & dst,const Vector4f & s0)1260 void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0) 1261 { 1262 // half2 | half1 1263 halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF))); 1264 halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16)); 1265 } 1266 packSnorm2x16(Vector4f & d,const Vector4f & s0)1267 void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0) 1268 { 1269 // round(clamp(c, -1.0, 1.0) * 32767.0) 1270 d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) | 1271 ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16)); 1272 } 1273 packUnorm2x16(Vector4f & d,const Vector4f & s0)1274 void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0) 1275 { 1276 // round(clamp(c, 0.0, 1.0) * 65535.0) 1277 d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) | 1278 ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16)); 1279 } 1280 unpackSnorm2x16(Vector4f & dst,const Vector4f & s0)1281 void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0) 1282 { 1283 // clamp(f / 32727.0, -1.0, 1.0) 1284 dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f)); 1285 dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f)); 1286 } 1287 unpackUnorm2x16(Vector4f & dst,const Vector4f & s0)1288 void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0) 1289 { 1290 // f / 65535.0 1291 dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000)); 1292 dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000)); 1293 } 1294 det2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1295 void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1296 { 1297 dst.x = src0.x * src1.y - src0.y * src1.x; 1298 dst.y = dst.z = dst.w = dst.x; 1299 } 1300 det3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1301 void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1302 { 1303 crs(dst, src1, src2); 1304 dp3(dst, dst, src0); 1305 } 1306 det4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2,const Vector4f & src3)1307 void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3) 1308 { 1309 dst.x = src2.z * src3.w - src2.w * src3.z; 1310 dst.y = src1.w * src3.z - src1.z * src3.w; 1311 dst.z = src1.z * src2.w - src1.w * src2.z; 1312 dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) - 1313 src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) + 1314 src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) + 1315 src2.x * (src1.w * src3.y - src1.y * src3.w) + 1316 src3.x * (src1.y * src2.w - src1.w * src2.y)) + 1317 src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) + 1318 src2.x * (src1.y * src3.z - src1.z * src3.y) + 1319 src3.x * (src1.z * src2.y - src1.y * src2.z)); 1320 dst.y = dst.z = dst.w = dst.x; 1321 } 1322 frc(Vector4f & dst,const Vector4f & src)1323 void ShaderCore::frc(Vector4f &dst, const Vector4f &src) 1324 { 1325 dst.x = Frac(src.x); 1326 dst.y = Frac(src.y); 1327 dst.z = Frac(src.z); 1328 dst.w = Frac(src.w); 1329 } 1330 trunc(Vector4f & dst,const Vector4f & src)1331 void ShaderCore::trunc(Vector4f &dst, const Vector4f &src) 1332 { 1333 dst.x = Trunc(src.x); 1334 dst.y = Trunc(src.y); 1335 dst.z = Trunc(src.z); 1336 dst.w = Trunc(src.w); 1337 } 1338 floor(Vector4f & dst,const Vector4f & src)1339 void ShaderCore::floor(Vector4f &dst, const Vector4f &src) 1340 { 1341 dst.x = Floor(src.x); 1342 dst.y = Floor(src.y); 1343 dst.z = Floor(src.z); 1344 dst.w = Floor(src.w); 1345 } 1346 round(Vector4f & dst,const Vector4f & src)1347 void ShaderCore::round(Vector4f &dst, const Vector4f &src) 1348 { 1349 dst.x = Round(src.x); 1350 dst.y = Round(src.y); 1351 dst.z = Round(src.z); 1352 dst.w = Round(src.w); 1353 } 1354 roundEven(Vector4f & dst,const Vector4f & src)1355 void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src) 1356 { 1357 // dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src)); 1358 // ex.: 1.5: 2 + (0 * 2 - 1) * 1 * 0 = 2 1359 // 2.5: 3 + (0 * 2 - 1) * 1 * 1 = 2 1360 // -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2 1361 // -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2 1362 // Even if the round implementation rounds the other way: 1363 // 1.5: 1 + (1 * 2 - 1) * 1 * 1 = 2 1364 // 2.5: 2 + (1 * 2 - 1) * 1 * 0 = 2 1365 // -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2 1366 // -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2 1367 round(dst, src); 1368 dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1)); 1369 dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1)); 1370 dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1)); 1371 dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1)); 1372 } 1373 ceil(Vector4f & dst,const Vector4f & src)1374 void ShaderCore::ceil(Vector4f &dst, const Vector4f &src) 1375 { 1376 dst.x = Ceil(src.x); 1377 dst.y = Ceil(src.y); 1378 dst.z = Ceil(src.z); 1379 dst.w = Ceil(src.w); 1380 } 1381 powx(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1382 void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 1383 { 1384 Float4 pow = power(src0.x, src1.x, pp); 1385 1386 dst.x = pow; 1387 dst.y = pow; 1388 dst.z = pow; 1389 dst.w = pow; 1390 } 1391 pow(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1392 void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 1393 { 1394 dst.x = power(src0.x, src1.x, pp); 1395 dst.y = power(src0.y, src1.y, pp); 1396 dst.z = power(src0.z, src1.z, pp); 1397 dst.w = power(src0.w, src1.w, pp); 1398 } 1399 crs(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1400 void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1401 { 1402 dst.x = src0.y * src1.z - src0.z * src1.y; 1403 dst.y = src0.z * src1.x - src0.x * src1.z; 1404 dst.z = src0.x * src1.y - src0.y * src1.x; 1405 } 1406 forward1(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1407 void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1408 { 1409 Int4 flip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000); 1410 1411 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1412 } 1413 forward2(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1414 void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1415 { 1416 Int4 flip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000); 1417 1418 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1419 dst.y = As<Float4>(flip ^ As<Int4>(N.y)); 1420 } 1421 forward3(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1422 void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1423 { 1424 Int4 flip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000); 1425 1426 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1427 dst.y = As<Float4>(flip ^ As<Int4>(N.y)); 1428 dst.z = As<Float4>(flip ^ As<Int4>(N.z)); 1429 } 1430 forward4(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1431 void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1432 { 1433 Int4 flip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000); 1434 1435 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1436 dst.y = As<Float4>(flip ^ As<Int4>(N.y)); 1437 dst.z = As<Float4>(flip ^ As<Int4>(N.z)); 1438 dst.w = As<Float4>(flip ^ As<Int4>(N.w)); 1439 } 1440 reflect1(Vector4f & dst,const Vector4f & I,const Vector4f & N)1441 void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1442 { 1443 Float4 d = N.x * I.x; 1444 1445 dst.x = I.x - Float4(2.0f) * d * N.x; 1446 } 1447 reflect2(Vector4f & dst,const Vector4f & I,const Vector4f & N)1448 void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1449 { 1450 Float4 d = dot2(N, I); 1451 1452 dst.x = I.x - Float4(2.0f) * d * N.x; 1453 dst.y = I.y - Float4(2.0f) * d * N.y; 1454 } 1455 reflect3(Vector4f & dst,const Vector4f & I,const Vector4f & N)1456 void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1457 { 1458 Float4 d = dot3(N, I); 1459 1460 dst.x = I.x - Float4(2.0f) * d * N.x; 1461 dst.y = I.y - Float4(2.0f) * d * N.y; 1462 dst.z = I.z - Float4(2.0f) * d * N.z; 1463 } 1464 reflect4(Vector4f & dst,const Vector4f & I,const Vector4f & N)1465 void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1466 { 1467 Float4 d = dot4(N, I); 1468 1469 dst.x = I.x - Float4(2.0f) * d * N.x; 1470 dst.y = I.y - Float4(2.0f) * d * N.y; 1471 dst.z = I.z - Float4(2.0f) * d * N.z; 1472 dst.w = I.w - Float4(2.0f) * d * N.w; 1473 } 1474 refract1(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1475 void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1476 { 1477 Float4 d = N.x * I.x; 1478 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1479 Int4 pos = CmpNLT(k, Float4(0.0f)); 1480 Float4 t = (eta * d + Sqrt(k)); 1481 1482 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1483 } 1484 refract2(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1485 void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1486 { 1487 Float4 d = dot2(N, I); 1488 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1489 Int4 pos = CmpNLT(k, Float4(0.0f)); 1490 Float4 t = (eta * d + Sqrt(k)); 1491 1492 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1493 dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); 1494 } 1495 refract3(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1496 void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1497 { 1498 Float4 d = dot3(N, I); 1499 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1500 Int4 pos = CmpNLT(k, Float4(0.0f)); 1501 Float4 t = (eta * d + Sqrt(k)); 1502 1503 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1504 dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); 1505 dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z)); 1506 } 1507 refract4(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1508 void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1509 { 1510 Float4 d = dot4(N, I); 1511 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1512 Int4 pos = CmpNLT(k, Float4(0.0f)); 1513 Float4 t = (eta * d + Sqrt(k)); 1514 1515 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1516 dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); 1517 dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z)); 1518 dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w)); 1519 } 1520 sgn(Vector4f & dst,const Vector4f & src)1521 void ShaderCore::sgn(Vector4f &dst, const Vector4f &src) 1522 { 1523 sgn(dst.x, src.x); 1524 sgn(dst.y, src.y); 1525 sgn(dst.z, src.z); 1526 sgn(dst.w, src.w); 1527 } 1528 isgn(Vector4f & dst,const Vector4f & src)1529 void ShaderCore::isgn(Vector4f &dst, const Vector4f &src) 1530 { 1531 isgn(dst.x, src.x); 1532 isgn(dst.y, src.y); 1533 isgn(dst.z, src.z); 1534 isgn(dst.w, src.w); 1535 } 1536 abs(Vector4f & dst,const Vector4f & src)1537 void ShaderCore::abs(Vector4f &dst, const Vector4f &src) 1538 { 1539 dst.x = Abs(src.x); 1540 dst.y = Abs(src.y); 1541 dst.z = Abs(src.z); 1542 dst.w = Abs(src.w); 1543 } 1544 iabs(Vector4f & dst,const Vector4f & src)1545 void ShaderCore::iabs(Vector4f &dst, const Vector4f &src) 1546 { 1547 dst.x = As<Float4>(Abs(As<Int4>(src.x))); 1548 dst.y = As<Float4>(Abs(As<Int4>(src.y))); 1549 dst.z = As<Float4>(Abs(As<Int4>(src.z))); 1550 dst.w = As<Float4>(Abs(As<Int4>(src.w))); 1551 } 1552 nrm2(Vector4f & dst,const Vector4f & src,bool pp)1553 void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp) 1554 { 1555 Float4 dot = dot2(src, src); 1556 Float4 rsq = reciprocalSquareRoot(dot, false, pp); 1557 1558 dst.x = src.x * rsq; 1559 dst.y = src.y * rsq; 1560 dst.z = src.z * rsq; 1561 dst.w = src.w * rsq; 1562 } 1563 nrm3(Vector4f & dst,const Vector4f & src,bool pp)1564 void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp) 1565 { 1566 Float4 dot = dot3(src, src); 1567 Float4 rsq = reciprocalSquareRoot(dot, false, pp); 1568 1569 dst.x = src.x * rsq; 1570 dst.y = src.y * rsq; 1571 dst.z = src.z * rsq; 1572 dst.w = src.w * rsq; 1573 } 1574 nrm4(Vector4f & dst,const Vector4f & src,bool pp)1575 void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp) 1576 { 1577 Float4 dot = dot4(src, src); 1578 Float4 rsq = reciprocalSquareRoot(dot, false, pp); 1579 1580 dst.x = src.x * rsq; 1581 dst.y = src.y * rsq; 1582 dst.z = src.z * rsq; 1583 dst.w = src.w * rsq; 1584 } 1585 sincos(Vector4f & dst,const Vector4f & src,bool pp)1586 void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp) 1587 { 1588 dst.x = cosine_pi(src.x, pp); 1589 dst.y = sine_pi(src.x, pp); 1590 } 1591 cos(Vector4f & dst,const Vector4f & src,bool pp)1592 void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp) 1593 { 1594 dst.x = cosine(src.x, pp); 1595 dst.y = cosine(src.y, pp); 1596 dst.z = cosine(src.z, pp); 1597 dst.w = cosine(src.w, pp); 1598 } 1599 sin(Vector4f & dst,const Vector4f & src,bool pp)1600 void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp) 1601 { 1602 dst.x = sine(src.x, pp); 1603 dst.y = sine(src.y, pp); 1604 dst.z = sine(src.z, pp); 1605 dst.w = sine(src.w, pp); 1606 } 1607 tan(Vector4f & dst,const Vector4f & src,bool pp)1608 void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp) 1609 { 1610 dst.x = tangent(src.x, pp); 1611 dst.y = tangent(src.y, pp); 1612 dst.z = tangent(src.z, pp); 1613 dst.w = tangent(src.w, pp); 1614 } 1615 acos(Vector4f & dst,const Vector4f & src,bool pp)1616 void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp) 1617 { 1618 dst.x = arccos(src.x, pp); 1619 dst.y = arccos(src.y, pp); 1620 dst.z = arccos(src.z, pp); 1621 dst.w = arccos(src.w, pp); 1622 } 1623 asin(Vector4f & dst,const Vector4f & src,bool pp)1624 void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp) 1625 { 1626 dst.x = arcsin(src.x, pp); 1627 dst.y = arcsin(src.y, pp); 1628 dst.z = arcsin(src.z, pp); 1629 dst.w = arcsin(src.w, pp); 1630 } 1631 atan(Vector4f & dst,const Vector4f & src,bool pp)1632 void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp) 1633 { 1634 dst.x = arctan(src.x, pp); 1635 dst.y = arctan(src.y, pp); 1636 dst.z = arctan(src.z, pp); 1637 dst.w = arctan(src.w, pp); 1638 } 1639 atan2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1640 void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 1641 { 1642 dst.x = arctan(src0.x, src1.x, pp); 1643 dst.y = arctan(src0.y, src1.y, pp); 1644 dst.z = arctan(src0.z, src1.z, pp); 1645 dst.w = arctan(src0.w, src1.w, pp); 1646 } 1647 cosh(Vector4f & dst,const Vector4f & src,bool pp)1648 void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp) 1649 { 1650 dst.x = cosineh(src.x, pp); 1651 dst.y = cosineh(src.y, pp); 1652 dst.z = cosineh(src.z, pp); 1653 dst.w = cosineh(src.w, pp); 1654 } 1655 sinh(Vector4f & dst,const Vector4f & src,bool pp)1656 void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp) 1657 { 1658 dst.x = sineh(src.x, pp); 1659 dst.y = sineh(src.y, pp); 1660 dst.z = sineh(src.z, pp); 1661 dst.w = sineh(src.w, pp); 1662 } 1663 tanh(Vector4f & dst,const Vector4f & src,bool pp)1664 void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp) 1665 { 1666 dst.x = tangenth(src.x, pp); 1667 dst.y = tangenth(src.y, pp); 1668 dst.z = tangenth(src.z, pp); 1669 dst.w = tangenth(src.w, pp); 1670 } 1671 acosh(Vector4f & dst,const Vector4f & src,bool pp)1672 void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp) 1673 { 1674 dst.x = arccosh(src.x, pp); 1675 dst.y = arccosh(src.y, pp); 1676 dst.z = arccosh(src.z, pp); 1677 dst.w = arccosh(src.w, pp); 1678 } 1679 asinh(Vector4f & dst,const Vector4f & src,bool pp)1680 void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp) 1681 { 1682 dst.x = arcsinh(src.x, pp); 1683 dst.y = arcsinh(src.y, pp); 1684 dst.z = arcsinh(src.z, pp); 1685 dst.w = arcsinh(src.w, pp); 1686 } 1687 atanh(Vector4f & dst,const Vector4f & src,bool pp)1688 void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp) 1689 { 1690 dst.x = arctanh(src.x, pp); 1691 dst.y = arctanh(src.y, pp); 1692 dst.z = arctanh(src.z, pp); 1693 dst.w = arctanh(src.w, pp); 1694 } 1695 expp(Vector4f & dst,const Vector4f & src,unsigned short shaderModel)1696 void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel) 1697 { 1698 if(shaderModel < 0x0200) 1699 { 1700 Float4 frc = Frac(src.x); 1701 Float4 floor = src.x - frc; 1702 1703 dst.x = exponential2(floor, true); 1704 dst.y = frc; 1705 dst.z = exponential2(src.x, true); 1706 dst.w = Float4(1.0f); 1707 } 1708 else // Version >= 2.0 1709 { 1710 exp2x(dst, src, true); // FIXME: 10-bit precision suffices 1711 } 1712 } 1713 logp(Vector4f & dst,const Vector4f & src,unsigned short shaderModel)1714 void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel) 1715 { 1716 if(shaderModel < 0x0200) 1717 { 1718 Float4 tmp0; 1719 Float4 tmp1; 1720 Float4 t; 1721 Int4 r; 1722 1723 tmp0 = Abs(src.x); 1724 tmp1 = tmp0; 1725 1726 // X component 1727 r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127); 1728 dst.x = Float4(r); 1729 1730 // Y component 1731 dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f))); 1732 1733 // Z component 1734 dst.z = logarithm2(src.x, true, true); 1735 1736 // W component 1737 dst.w = 1.0f; 1738 } 1739 else 1740 { 1741 log2x(dst, src, true); 1742 } 1743 } 1744 cmp0(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1745 void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1746 { 1747 cmp0(dst.x, src0.x, src1.x, src2.x); 1748 cmp0(dst.y, src0.y, src1.y, src2.y); 1749 cmp0(dst.z, src0.z, src1.z, src2.z); 1750 cmp0(dst.w, src0.w, src1.w, src2.w); 1751 } 1752 select(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1753 void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1754 { 1755 select(dst.x, As<Int4>(src0.x), src1.x, src2.x); 1756 select(dst.y, As<Int4>(src0.y), src1.y, src2.y); 1757 select(dst.z, As<Int4>(src0.z), src1.z, src2.z); 1758 select(dst.w, As<Int4>(src0.w), src1.w, src2.w); 1759 } 1760 extract(Float4 & dst,const Vector4f & src0,const Float4 & src1)1761 void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1) 1762 { 1763 select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x); 1764 select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst); 1765 select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst); 1766 } 1767 insert(Vector4f & dst,const Vector4f & src,const Float4 & element,const Float4 & index)1768 void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index) 1769 { 1770 select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x); 1771 select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y); 1772 select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z); 1773 select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w); 1774 } 1775 sgn(Float4 & dst,const Float4 & src)1776 void ShaderCore::sgn(Float4 &dst, const Float4 &src) 1777 { 1778 Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f)); 1779 Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f)); 1780 dst = As<Float4>(neg | pos); 1781 } 1782 isgn(Float4 & dst,const Float4 & src)1783 void ShaderCore::isgn(Float4 &dst, const Float4 &src) 1784 { 1785 Int4 neg = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1); 1786 Int4 pos = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1); 1787 dst = As<Float4>(neg | pos); 1788 } 1789 cmp0(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1790 void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2) 1791 { 1792 Int4 pos = CmpLE(Float4(0.0f), src0); 1793 select(dst, pos, src1, src2); 1794 } 1795 cmp0i(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1796 void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2) 1797 { 1798 Int4 pos = CmpEQ(Int4(0), As<Int4>(src0)); 1799 select(dst, pos, src1, src2); 1800 } 1801 select(Float4 & dst,RValue<Int4> src0,const Float4 & src1,const Float4 & src2)1802 void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2) 1803 { 1804 // FIXME: LLVM vector select 1805 dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2))); 1806 } 1807 cmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1808 void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) 1809 { 1810 switch(control) 1811 { 1812 case Shader::CONTROL_GT: 1813 dst.x = As<Float4>(CmpNLE(src0.x, src1.x)); 1814 dst.y = As<Float4>(CmpNLE(src0.y, src1.y)); 1815 dst.z = As<Float4>(CmpNLE(src0.z, src1.z)); 1816 dst.w = As<Float4>(CmpNLE(src0.w, src1.w)); 1817 break; 1818 case Shader::CONTROL_EQ: 1819 dst.x = As<Float4>(CmpEQ(src0.x, src1.x)); 1820 dst.y = As<Float4>(CmpEQ(src0.y, src1.y)); 1821 dst.z = As<Float4>(CmpEQ(src0.z, src1.z)); 1822 dst.w = As<Float4>(CmpEQ(src0.w, src1.w)); 1823 break; 1824 case Shader::CONTROL_GE: 1825 dst.x = As<Float4>(CmpNLT(src0.x, src1.x)); 1826 dst.y = As<Float4>(CmpNLT(src0.y, src1.y)); 1827 dst.z = As<Float4>(CmpNLT(src0.z, src1.z)); 1828 dst.w = As<Float4>(CmpNLT(src0.w, src1.w)); 1829 break; 1830 case Shader::CONTROL_LT: 1831 dst.x = As<Float4>(CmpLT(src0.x, src1.x)); 1832 dst.y = As<Float4>(CmpLT(src0.y, src1.y)); 1833 dst.z = As<Float4>(CmpLT(src0.z, src1.z)); 1834 dst.w = As<Float4>(CmpLT(src0.w, src1.w)); 1835 break; 1836 case Shader::CONTROL_NE: 1837 dst.x = As<Float4>(CmpNEQ(src0.x, src1.x)); 1838 dst.y = As<Float4>(CmpNEQ(src0.y, src1.y)); 1839 dst.z = As<Float4>(CmpNEQ(src0.z, src1.z)); 1840 dst.w = As<Float4>(CmpNEQ(src0.w, src1.w)); 1841 break; 1842 case Shader::CONTROL_LE: 1843 dst.x = As<Float4>(CmpLE(src0.x, src1.x)); 1844 dst.y = As<Float4>(CmpLE(src0.y, src1.y)); 1845 dst.z = As<Float4>(CmpLE(src0.z, src1.z)); 1846 dst.w = As<Float4>(CmpLE(src0.w, src1.w)); 1847 break; 1848 default: 1849 ASSERT(false); 1850 } 1851 } 1852 icmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1853 void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) 1854 { 1855 switch(control) 1856 { 1857 case Shader::CONTROL_GT: 1858 dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x))); 1859 dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y))); 1860 dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z))); 1861 dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w))); 1862 break; 1863 case Shader::CONTROL_EQ: 1864 dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x))); 1865 dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y))); 1866 dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z))); 1867 dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w))); 1868 break; 1869 case Shader::CONTROL_GE: 1870 dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x))); 1871 dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y))); 1872 dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z))); 1873 dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w))); 1874 break; 1875 case Shader::CONTROL_LT: 1876 dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x))); 1877 dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y))); 1878 dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z))); 1879 dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w))); 1880 break; 1881 case Shader::CONTROL_NE: 1882 dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x))); 1883 dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y))); 1884 dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z))); 1885 dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w))); 1886 break; 1887 case Shader::CONTROL_LE: 1888 dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x))); 1889 dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y))); 1890 dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z))); 1891 dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w))); 1892 break; 1893 default: 1894 ASSERT(false); 1895 } 1896 } 1897 ucmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1898 void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) 1899 { 1900 switch(control) 1901 { 1902 case Shader::CONTROL_GT: 1903 dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1904 dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1905 dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1906 dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1907 break; 1908 case Shader::CONTROL_EQ: 1909 dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1910 dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1911 dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1912 dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1913 break; 1914 case Shader::CONTROL_GE: 1915 dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1916 dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1917 dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1918 dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1919 break; 1920 case Shader::CONTROL_LT: 1921 dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1922 dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1923 dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1924 dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1925 break; 1926 case Shader::CONTROL_NE: 1927 dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1928 dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1929 dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1930 dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1931 break; 1932 case Shader::CONTROL_LE: 1933 dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1934 dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1935 dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1936 dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1937 break; 1938 default: 1939 ASSERT(false); 1940 } 1941 } 1942 all(Float4 & dst,const Vector4f & src)1943 void ShaderCore::all(Float4 &dst, const Vector4f &src) 1944 { 1945 dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w)); 1946 } 1947 any(Float4 & dst,const Vector4f & src)1948 void ShaderCore::any(Float4 &dst, const Vector4f &src) 1949 { 1950 dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w)); 1951 } 1952 bitwise_not(Vector4f & dst,const Vector4f & src)1953 void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src) 1954 { 1955 dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF)); 1956 dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF)); 1957 dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF)); 1958 dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF)); 1959 } 1960 bitwise_or(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1961 void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1962 { 1963 dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x)); 1964 dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y)); 1965 dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z)); 1966 dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w)); 1967 } 1968 bitwise_xor(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1969 void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1970 { 1971 dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x)); 1972 dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y)); 1973 dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z)); 1974 dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w)); 1975 } 1976 bitwise_and(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1977 void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1978 { 1979 dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x)); 1980 dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y)); 1981 dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z)); 1982 dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w)); 1983 } 1984 equal(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1985 void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1986 { 1987 dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) & 1988 CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) & 1989 CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) & 1990 CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1991 dst.y = dst.x; 1992 dst.z = dst.x; 1993 dst.w = dst.x; 1994 } 1995 notEqual(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1996 void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1997 { 1998 dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) | 1999 CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) | 2000 CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) | 2001 CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 2002 dst.y = dst.x; 2003 dst.z = dst.x; 2004 dst.w = dst.x; 2005 } 2006 } 2007