1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "ShaderCore.hpp"
16 
17 #include "Renderer/Renderer.hpp"
18 #include "Common/Debug.hpp"
19 
20 #include <limits.h>
21 
22 namespace sw
23 {
	// Precision levels for the transcendental helpers. The variables are only
	// declared here; their definitions live in another translation unit.
	extern TranscendentalPrecision logPrecision;
	extern TranscendentalPrecision expPrecision;
	extern TranscendentalPrecision rcpPrecision;   // Consulted by reciprocal()
	extern TranscendentalPrecision rsqPrecision;
28 
Vector4s()29 	Vector4s::Vector4s()
30 	{
31 	}
32 
Vector4s(unsigned short x,unsigned short y,unsigned short z,unsigned short w)33 	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
34 	{
35 		this->x = Short4(x);
36 		this->y = Short4(y);
37 		this->z = Short4(z);
38 		this->w = Short4(w);
39 	}
40 
Vector4s(const Vector4s & rhs)41 	Vector4s::Vector4s(const Vector4s &rhs)
42 	{
43 		x = rhs.x;
44 		y = rhs.y;
45 		z = rhs.z;
46 		w = rhs.w;
47 	}
48 
operator =(const Vector4s & rhs)49 	Vector4s &Vector4s::operator=(const Vector4s &rhs)
50 	{
51 		x = rhs.x;
52 		y = rhs.y;
53 		z = rhs.z;
54 		w = rhs.w;
55 
56 		return *this;
57 	}
58 
operator [](int i)59 	Short4 &Vector4s::operator[](int i)
60 	{
61 		switch(i)
62 		{
63 		case 0: return x;
64 		case 1: return y;
65 		case 2: return z;
66 		case 3: return w;
67 		}
68 
69 		return x;
70 	}
71 
Vector4f()72 	Vector4f::Vector4f()
73 	{
74 	}
75 
Vector4f(float x,float y,float z,float w)76 	Vector4f::Vector4f(float x, float y, float z, float w)
77 	{
78 		this->x = Float4(x);
79 		this->y = Float4(y);
80 		this->z = Float4(z);
81 		this->w = Float4(w);
82 	}
83 
Vector4f(const Vector4f & rhs)84 	Vector4f::Vector4f(const Vector4f &rhs)
85 	{
86 		x = rhs.x;
87 		y = rhs.y;
88 		z = rhs.z;
89 		w = rhs.w;
90 	}
91 
operator =(const Vector4f & rhs)92 	Vector4f &Vector4f::operator=(const Vector4f &rhs)
93 	{
94 		x = rhs.x;
95 		y = rhs.y;
96 		z = rhs.z;
97 		w = rhs.w;
98 
99 		return *this;
100 	}
101 
operator [](int i)102 	Float4 &Vector4f::operator[](int i)
103 	{
104 		switch(i)
105 		{
106 		case 0: return x;
107 		case 1: return y;
108 		case 2: return z;
109 		case 3: return w;
110 		}
111 
112 		return x;
113 	}
114 
	// Computes 2^x for each lane of x.
	// The pp argument is accepted but not used by this implementation.
	Float4 exponential2(RValue<Float4> x, bool pp)
	{
		// This implementation is based on 2^(i + f) = 2^i * 2^f,
		// where i is the integer part of x and f is the fraction.

		// For 2^i we can put the integer part directly in the exponent of
		// the IEEE-754 floating-point number. Clamp to prevent overflow
		// past the representation of infinity.
		Float4 x0 = x;
		x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
		x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f

		// NOTE(review): subtracting 0.5 before RoundInt presumably turns the
		// rounding conversion into a floor - confirm against RoundInt's rounding mode.
		Int4 i = RoundInt(x0 - Float4(0.5f));
		Float4 ii = As<Float4>((i + Int4(127)) << 23);   // Add single-precision bias, and shift into exponent.

		// For the fractional part use a polynomial
		// which approximates 2^f in the 0 to 1 range.
		// The polynomial is evaluated in Horner form, highest-order coefficient first.
		Float4 f = x0 - Float4(i);
		Float4 ff = As<Float4>(Int4(0x3AF61905));     // 1.8775767e-3f
		ff = ff * f + As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
		ff = ff * f + As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
		ff = ff * f + As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
		ff = ff * f + As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
		ff = ff * f + Float4(1.0f);

		// 2^i * 2^f
		return ii * ff;
	}
142 
	// Approximates log2(x) for each lane of x by splitting the float into its
	// exponent and mantissa fields.
	// The absolute and pp arguments are accepted but not used by this implementation.
	Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
	{
		Float4 x0;
		Float4 x1;
		Float4 x2;
		Float4 x3;

		x0 = x;

		// Isolate the 8 exponent bits, move them into the top of the mantissa field
		// and attach the exponent of 1.0f, giving the float 1 + e/256 for biased exponent e.
		x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
		x1 = As<Float4>(As<UInt4>(x1) >> 8);
		x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
		// 1.4960938 is 1 + 127/256, so this yields the unbiased exponent e - 127.
		x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
		// Keep the 23 mantissa bits and attach the exponent of 1.0f: a value in [1, 2).
		x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));

		// Rational approximation: quadratic numerator over cubic denominator in the mantissa value.
		x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
		x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
		x2 /= x3;

		// Add the mantissa contribution to the exponent contribution.
		x1 += (x0 - Float4(1.0f)) * x2;

		// Lanes whose input bits equal 0x7F800000 (+infinity) return the input unchanged.
		Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));
		return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));
	}
167 
exponential(RValue<Float4> x,bool pp)168 	Float4 exponential(RValue<Float4> x, bool pp)
169 	{
170 		// FIXME: Propagate the constant
171 		return exponential2(Float4(1.44269504f) * x, pp);   // 1/ln(2)
172 	}
173 
logarithm(RValue<Float4> x,bool absolute,bool pp)174 	Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)
175 	{
176 		// FIXME: Propagate the constant
177 		return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp);   // ln(2)
178 	}
179 
power(RValue<Float4> x,RValue<Float4> y,bool pp)180 	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
181 	{
182 		Float4 log = logarithm2(x, true, pp);
183 		log *= y;
184 		return exponential2(log, pp);
185 	}
186 
reciprocal(RValue<Float4> x,bool pp,bool finite,bool exactAtPow2)187 	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
188 	{
189 		Float4 rcp;
190 
191 		if(!pp && rcpPrecision >= WHQL)
192 		{
193 			rcp = Float4(1.0f) / x;
194 		}
195 		else
196 		{
197 			rcp = Rcp_pp(x, exactAtPow2);
198 
199 			if(!pp)
200 			{
201 				rcp = (rcp + rcp) - (x * rcp * rcp);
202 			}
203 		}
204 
205 		if(finite)
206 		{
207 			int big = 0x7F7FFFFF;
208 			rcp = Min(rcp, Float4((float&)big));
209 		}
210 
211 		return rcp;
212 	}
213 
reciprocalSquareRoot(RValue<Float4> x,bool absolute,bool pp)214 	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
215 	{
216 		Float4 abs = x;
217 
218 		if(absolute)
219 		{
220 			abs = Abs(abs);
221 		}
222 
223 		Float4 rsq;
224 
225 		if(!pp)
226 		{
227 			rsq = Float4(1.0f) / Sqrt(abs);
228 		}
229 		else
230 		{
231 			rsq = RcpSqrt_pp(abs);
232 
233 			if(!pp)
234 			{
235 				rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
236 			}
237 
238 			rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
239 		}
240 
241 		return rsq;
242 	}
243 
modulo(RValue<Float4> x,RValue<Float4> y)244 	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
245 	{
246 		return x - y * Floor(x / y);
247 	}
248 
sine_pi(RValue<Float4> x,bool pp)249 	Float4 sine_pi(RValue<Float4> x, bool pp)
250 	{
251 		const Float4 A = Float4(-4.05284734e-1f);   // -4/pi^2
252 		const Float4 B = Float4(1.27323954e+0f);    // 4/pi
253 		const Float4 C = Float4(7.75160950e-1f);
254 		const Float4 D = Float4(2.24839049e-1f);
255 
256 		// Parabola approximating sine
257 		Float4 sin = x * (Abs(x) * A + B);
258 
259 		// Improve precision from 0.06 to 0.001
260 		if(true)
261 		{
262 			sin = sin * (Abs(sin) * D + C);
263 		}
264 
265 		return sin;
266 	}
267 
cosine_pi(RValue<Float4> x,bool pp)268 	Float4 cosine_pi(RValue<Float4> x, bool pp)
269 	{
270 		// cos(x) = sin(x + pi/2)
271 		Float4 y = x + Float4(1.57079632e+0f);
272 
273 		// Wrap around
274 		y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));
275 
276 		return sine_pi(y, pp);
277 	}
278 
sine(RValue<Float4> x,bool pp)279 	Float4 sine(RValue<Float4> x, bool pp)
280 	{
281 		// Reduce to [-0.5, 0.5] range
282 		Float4 y = x * Float4(1.59154943e-1f);   // 1/2pi
283 		y = y - Round(y);
284 
285 		if(!pp)
286 		{
287 			// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
288 			// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
289 			// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
290 			//  pp : 4 mul, 2 add, 2 abs
291 
292 			Float4 y2 = y * y;
293 			Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f);
294 			Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f));
295 			Float4 c2 = (c1 * c1) - (s1 * s1);
296 			Float4 s2 = Float4(2.0f) * s1 * c1;
297 			return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true);
298 		}
299 
300 		const Float4 A = Float4(-16.0f);
301 		const Float4 B = Float4(8.0f);
302 		const Float4 C = Float4(7.75160950e-1f);
303 		const Float4 D = Float4(2.24839049e-1f);
304 
305 		// Parabola approximating sine
306 		Float4 sin = y * (Abs(y) * A + B);
307 
308 		// Improve precision from 0.06 to 0.001
309 		if(true)
310 		{
311 			sin = sin * (Abs(sin) * D + C);
312 		}
313 
314 		return sin;
315 	}
316 
cosine(RValue<Float4> x,bool pp)317 	Float4 cosine(RValue<Float4> x, bool pp)
318 	{
319 		// cos(x) = sin(x + pi/2)
320 		Float4 y = x + Float4(1.57079632e+0f);
321 		return sine(y, pp);
322 	}
323 
tangent(RValue<Float4> x,bool pp)324 	Float4 tangent(RValue<Float4> x, bool pp)
325 	{
326 		return sine(x, pp) / cosine(x, pp);
327 	}
328 
arccos(RValue<Float4> x,bool pp)329 	Float4 arccos(RValue<Float4> x, bool pp)
330 	{
331 		// pi/2 - arcsin(x)
332 		return Float4(1.57079632e+0f) - arcsin(x);
333 	}
334 
arcsin(RValue<Float4> x,bool pp)335 	Float4 arcsin(RValue<Float4> x, bool pp)
336 	{
337 		if(false) // Simpler implementation fails even lowp precision tests
338 		{
339 			// x*(pi/2-sqrt(1-x*x)*pi/5)
340 			return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
341 		}
342 		else
343 		{
344 			// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
345 			const Float4 half_pi(1.57079632f);
346 			const Float4 a0(1.5707288f);
347 			const Float4 a1(-0.2121144f);
348 			const Float4 a2(0.0742610f);
349 			const Float4 a3(-0.0187293f);
350 			Float4 absx = Abs(x);
351 			return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
352 			       (As<Int4>(x) & Int4(0x80000000)));
353 		}
354 	}
355 
356 	// Approximation of atan in [0..1]
arctan_01(Float4 x,bool pp)357 	Float4 arctan_01(Float4 x, bool pp)
358 	{
359 		if(pp)
360 		{
361 			return x * (Float4(-0.27f) * x + Float4(1.05539816f));
362 		}
363 		else
364 		{
365 			// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
366 			const Float4 a2(-0.3333314528f);
367 			const Float4 a4(0.1999355085f);
368 			const Float4 a6(-0.1420889944f);
369 			const Float4 a8(0.1065626393f);
370 			const Float4 a10(-0.0752896400f);
371 			const Float4 a12(0.0429096138f);
372 			const Float4 a14(-0.0161657367f);
373 			const Float4 a16(0.0028662257f);
374 			Float4 x2 = x * x;
375 			return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16)))))))));
376 		}
377 	}
378 
arctan(RValue<Float4> x,bool pp)379 	Float4 arctan(RValue<Float4> x, bool pp)
380 	{
381 		Float4 absx = Abs(x);
382 		Int4 O = CmpNLT(absx, Float4(1.0f));
383 		Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select
384 
385 		const Float4 half_pi(1.57079632f);
386 		Float4 theta = arctan_01(y, pp);
387 		return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select
388 		       (As<Int4>(x) & Int4(0x80000000)));
389 	}
390 
	// Two-argument arctangent of (y, x) for each lane.
	// The input is folded into the first octant by sign and swap operations, the
	// angle there is obtained from arctan_01(), and the offsets accumulated in
	// theta while folding are added back. All selections are done with bit masks.
	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
	{
		const Float4 pi(3.14159265f);            // pi
		const Float4 minus_pi(-3.14159265f);     // -pi
		const Float4 half_pi(1.57079632f);       // pi/2
		const Float4 quarter_pi(7.85398163e-1f); // pi/4

		// Rotate to upper semicircle when in lower semicircle
		// (theta starts at -pi for those lanes; the sign bit of y is XORed into x)
		Int4 S = CmpLT(y, Float4(0.0f));
		Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
		Float4 y0 = Abs(y);

		// Rotate to right quadrant when in left quadrant
		Int4 Q = CmpLT(x0, Float4(0.0f));
		theta += As<Float4>(Q & As<Int4>(half_pi));
		Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));  // FIXME: Vector select
		Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select

		// Mirror to first octant when in second octant
		Int4 O = CmpNLT(y1, x1);
		Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
		Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select

		// Approximation of atan in [0..1]
		// Lanes with x2 == 0 contribute nothing here; lanes with infinite y2 contribute pi/4.
		Int4 zero_x = CmpEQ(x2, Float4(0.0f));
		Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
		Float4 atan2_theta = arctan_01(y2 / x2, pp);
		theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
		                    (inf_y & As<Int4>(quarter_pi)));

		// Recover loss of precision for tiny theta angles
		Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
		return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
	}
426 
sineh(RValue<Float4> x,bool pp)427 	Float4 sineh(RValue<Float4> x, bool pp)
428 	{
429 		return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);
430 	}
431 
cosineh(RValue<Float4> x,bool pp)432 	Float4 cosineh(RValue<Float4> x, bool pp)
433 	{
434 		return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);
435 	}
436 
tangenth(RValue<Float4> x,bool pp)437 	Float4 tangenth(RValue<Float4> x, bool pp)
438 	{
439 		Float4 e_x = exponential(x, pp);
440 		Float4 e_minus_x = exponential(-x, pp);
441 		return (e_x - e_minus_x) / (e_x + e_minus_x);
442 	}
443 
arccosh(RValue<Float4> x,bool pp)444 	Float4 arccosh(RValue<Float4> x, bool pp)
445 	{
446 		return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp);
447 	}
448 
arcsinh(RValue<Float4> x,bool pp)449 	Float4 arcsinh(RValue<Float4> x, bool pp)
450 	{
451 		return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp);
452 	}
453 
arctanh(RValue<Float4> x,bool pp)454 	Float4 arctanh(RValue<Float4> x, bool pp)
455 	{
456 		return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f);
457 	}
458 
dot2(const Vector4f & v0,const Vector4f & v1)459 	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
460 	{
461 		return v0.x * v1.x + v0.y * v1.y;
462 	}
463 
dot3(const Vector4f & v0,const Vector4f & v1)464 	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
465 	{
466 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
467 	}
468 
dot4(const Vector4f & v0,const Vector4f & v1)469 	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
470 	{
471 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
472 	}
473 
transpose4x4(Short4 & row0,Short4 & row1,Short4 & row2,Short4 & row3)474 	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
475 	{
476 		Int2 tmp0 = UnpackHigh(row0, row1);
477 		Int2 tmp1 = UnpackHigh(row2, row3);
478 		Int2 tmp2 = UnpackLow(row0, row1);
479 		Int2 tmp3 = UnpackLow(row2, row3);
480 
481 		row0 = UnpackLow(tmp2, tmp3);
482 		row1 = UnpackHigh(tmp2, tmp3);
483 		row2 = UnpackLow(tmp0, tmp1);
484 		row3 = UnpackHigh(tmp0, tmp1);
485 	}
486 
transpose4x3(Short4 & row0,Short4 & row1,Short4 & row2,Short4 & row3)487 	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
488 	{
489 		Int2 tmp0 = UnpackHigh(row0, row1);
490 		Int2 tmp1 = UnpackHigh(row2, row3);
491 		Int2 tmp2 = UnpackLow(row0, row1);
492 		Int2 tmp3 = UnpackLow(row2, row3);
493 
494 		row0 = UnpackLow(tmp2, tmp3);
495 		row1 = UnpackHigh(tmp2, tmp3);
496 		row2 = UnpackLow(tmp0, tmp1);
497 	}
498 
transpose4x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)499 	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
500 	{
501 		Float4 tmp0 = UnpackLow(row0, row1);
502 		Float4 tmp1 = UnpackLow(row2, row3);
503 		Float4 tmp2 = UnpackHigh(row0, row1);
504 		Float4 tmp3 = UnpackHigh(row2, row3);
505 
506 		row0 = Float4(tmp0.xy, tmp1.xy);
507 		row1 = Float4(tmp0.zw, tmp1.zw);
508 		row2 = Float4(tmp2.xy, tmp3.xy);
509 		row3 = Float4(tmp2.zw, tmp3.zw);
510 	}
511 
transpose4x3(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)512 	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
513 	{
514 		Float4 tmp0 = UnpackLow(row0, row1);
515 		Float4 tmp1 = UnpackLow(row2, row3);
516 		Float4 tmp2 = UnpackHigh(row0, row1);
517 		Float4 tmp3 = UnpackHigh(row2, row3);
518 
519 		row0 = Float4(tmp0.xy, tmp1.xy);
520 		row1 = Float4(tmp0.zw, tmp1.zw);
521 		row2 = Float4(tmp2.xy, tmp3.xy);
522 	}
523 
transpose4x2(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)524 	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
525 	{
526 		Float4 tmp0 = UnpackLow(row0, row1);
527 		Float4 tmp1 = UnpackLow(row2, row3);
528 
529 		row0 = Float4(tmp0.xy, tmp1.xy);
530 		row1 = Float4(tmp0.zw, tmp1.zw);
531 	}
532 
transpose4x1(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)533 	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
534 	{
535 		Float4 tmp0 = UnpackLow(row0, row1);
536 		Float4 tmp1 = UnpackLow(row2, row3);
537 
538 		row0 = Float4(tmp0.xy, tmp1.xy);
539 	}
540 
transpose2x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)541 	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
542 	{
543 		Float4 tmp01 = UnpackLow(row0, row1);
544 		Float4 tmp23 = UnpackHigh(row0, row1);
545 
546 		row0 = tmp01;
547 		row1 = Float4(tmp01.zw, row1.zw);
548 		row2 = tmp23;
549 		row3 = Float4(tmp23.zw, row3.zw);
550 	}
551 
transpose4xN(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3,int N)552 	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
553 	{
554 		switch(N)
555 		{
556 		case 1: transpose4x1(row0, row1, row2, row3); break;
557 		case 2: transpose4x2(row0, row1, row2, row3); break;
558 		case 3: transpose4x3(row0, row1, row2, row3); break;
559 		case 4: transpose4x4(row0, row1, row2, row3); break;
560 		}
561 	}
562 
operator [](RValue<Int4> index)563 	const Vector4f RegisterFile::operator[](RValue<Int4> index)
564 	{
565 		ASSERT(indirectAddressable);
566 
567 		Int index0 = Extract(index, 0);
568 		Int index1 = Extract(index, 1);
569 		Int index2 = Extract(index, 2);
570 		Int index3 = Extract(index, 3);
571 
572 		Vector4f r;
573 
574 		r.x.x = Extract(x[0][index0], 0);
575 		r.x.y = Extract(x[0][index1], 1);
576 		r.x.z = Extract(x[0][index2], 2);
577 		r.x.w = Extract(x[0][index3], 3);
578 
579 		r.y.x = Extract(y[0][index0], 0);
580 		r.y.y = Extract(y[0][index1], 1);
581 		r.y.z = Extract(y[0][index2], 2);
582 		r.y.w = Extract(y[0][index3], 3);
583 
584 		r.z.x = Extract(z[0][index0], 0);
585 		r.z.y = Extract(z[0][index1], 1);
586 		r.z.z = Extract(z[0][index2], 2);
587 		r.z.w = Extract(z[0][index3], 3);
588 
589 		r.w.x = Extract(w[0][index0], 0);
590 		r.w.y = Extract(w[0][index1], 1);
591 		r.w.z = Extract(w[0][index2], 2);
592 		r.w.w = Extract(w[0][index3], 3);
593 
594 		return r;
595 	}
596 
scatter_x(Int4 index,RValue<Float4> r)597 	void RegisterFile::scatter_x(Int4 index, RValue<Float4> r)
598 	{
599 		ASSERT(indirectAddressable);
600 
601 		Int index0 = Extract(index, 0);
602 		Int index1 = Extract(index, 1);
603 		Int index2 = Extract(index, 2);
604 		Int index3 = Extract(index, 3);
605 
606 		x[0][index0] = Insert(x[0][index0], Extract(r, 0), 0);
607 		x[0][index1] = Insert(x[0][index1], Extract(r, 1), 1);
608 		x[0][index2] = Insert(x[0][index2], Extract(r, 2), 2);
609 		x[0][index3] = Insert(x[0][index3], Extract(r, 3), 3);
610 	}
611 
scatter_y(Int4 index,RValue<Float4> r)612 	void RegisterFile::scatter_y(Int4 index, RValue<Float4> r)
613 	{
614 		ASSERT(indirectAddressable);
615 
616 		Int index0 = Extract(index, 0);
617 		Int index1 = Extract(index, 1);
618 		Int index2 = Extract(index, 2);
619 		Int index3 = Extract(index, 3);
620 
621 		y[0][index0] = Insert(y[0][index0], Extract(r, 0), 0);
622 		y[0][index1] = Insert(y[0][index1], Extract(r, 1), 1);
623 		y[0][index2] = Insert(y[0][index2], Extract(r, 2), 2);
624 		y[0][index3] = Insert(y[0][index3], Extract(r, 3), 3);
625 	}
626 
scatter_z(Int4 index,RValue<Float4> r)627 	void RegisterFile::scatter_z(Int4 index, RValue<Float4> r)
628 	{
629 		ASSERT(indirectAddressable);
630 
631 		Int index0 = Extract(index, 0);
632 		Int index1 = Extract(index, 1);
633 		Int index2 = Extract(index, 2);
634 		Int index3 = Extract(index, 3);
635 
636 		z[0][index0] = Insert(z[0][index0], Extract(r, 0), 0);
637 		z[0][index1] = Insert(z[0][index1], Extract(r, 1), 1);
638 		z[0][index2] = Insert(z[0][index2], Extract(r, 2), 2);
639 		z[0][index3] = Insert(z[0][index3], Extract(r, 3), 3);
640 	}
641 
scatter_w(Int4 index,RValue<Float4> r)642 	void RegisterFile::scatter_w(Int4 index, RValue<Float4> r)
643 	{
644 		ASSERT(indirectAddressable);
645 
646 		Int index0 = Extract(index, 0);
647 		Int index1 = Extract(index, 1);
648 		Int index2 = Extract(index, 2);
649 		Int index3 = Extract(index, 3);
650 
651 		w[0][index0] = Insert(w[0][index0], Extract(r, 0), 0);
652 		w[0][index1] = Insert(w[0][index1], Extract(r, 1), 1);
653 		w[0][index2] = Insert(w[0][index2], Extract(r, 2), 2);
654 		w[0][index3] = Insert(w[0][index3], Extract(r, 3), 3);
655 	}
656 
mov(Vector4f & dst,const Vector4f & src,bool integerDestination)657 	void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
658 	{
659 		if(integerDestination)
660 		{
661 			dst.x = As<Float4>(RoundInt(src.x));
662 			dst.y = As<Float4>(RoundInt(src.y));
663 			dst.z = As<Float4>(RoundInt(src.z));
664 			dst.w = As<Float4>(RoundInt(src.w));
665 		}
666 		else
667 		{
668 			dst = src;
669 		}
670 	}
671 
neg(Vector4f & dst,const Vector4f & src)672 	void ShaderCore::neg(Vector4f &dst, const Vector4f &src)
673 	{
674 		dst.x = -src.x;
675 		dst.y = -src.y;
676 		dst.z = -src.z;
677 		dst.w = -src.w;
678 	}
679 
ineg(Vector4f & dst,const Vector4f & src)680 	void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)
681 	{
682 		dst.x = As<Float4>(-As<Int4>(src.x));
683 		dst.y = As<Float4>(-As<Int4>(src.y));
684 		dst.z = As<Float4>(-As<Int4>(src.z));
685 		dst.w = As<Float4>(-As<Int4>(src.w));
686 	}
687 
f2b(Vector4f & dst,const Vector4f & src)688 	void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)
689 	{
690 		dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));
691 		dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f)));
692 		dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f)));
693 		dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f)));
694 	}
695 
b2f(Vector4f & dst,const Vector4f & src)696 	void ShaderCore::b2f(Vector4f &dst, const Vector4f &src)
697 	{
698 		dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f)));
699 		dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f)));
700 		dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f)));
701 		dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
702 	}
703 
f2i(Vector4f & dst,const Vector4f & src)704 	void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)
705 	{
706 		dst.x = As<Float4>(Int4(src.x));
707 		dst.y = As<Float4>(Int4(src.y));
708 		dst.z = As<Float4>(Int4(src.z));
709 		dst.w = As<Float4>(Int4(src.w));
710 	}
711 
i2f(Vector4f & dst,const Vector4f & src)712 	void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)
713 	{
714 		dst.x = Float4(As<Int4>(src.x));
715 		dst.y = Float4(As<Int4>(src.y));
716 		dst.z = Float4(As<Int4>(src.z));
717 		dst.w = Float4(As<Int4>(src.w));
718 	}
719 
f2u(Vector4f & dst,const Vector4f & src)720 	void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)
721 	{
722 		dst.x = As<Float4>(UInt4(src.x));
723 		dst.y = As<Float4>(UInt4(src.y));
724 		dst.z = As<Float4>(UInt4(src.z));
725 		dst.w = As<Float4>(UInt4(src.w));
726 	}
727 
u2f(Vector4f & dst,const Vector4f & src)728 	void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)
729 	{
730 		dst.x = Float4(As<UInt4>(src.x));
731 		dst.y = Float4(As<UInt4>(src.y));
732 		dst.z = Float4(As<UInt4>(src.z));
733 		dst.w = Float4(As<UInt4>(src.w));
734 	}
735 
i2b(Vector4f & dst,const Vector4f & src)736 	void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)
737 	{
738 		dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0)));
739 		dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0)));
740 		dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0)));
741 		dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0)));
742 	}
743 
b2i(Vector4f & dst,const Vector4f & src)744 	void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)
745 	{
746 		dst.x = As<Float4>(As<Int4>(src.x) & Int4(1));
747 		dst.y = As<Float4>(As<Int4>(src.y) & Int4(1));
748 		dst.z = As<Float4>(As<Int4>(src.z) & Int4(1));
749 		dst.w = As<Float4>(As<Int4>(src.w) & Int4(1));
750 	}
751 
add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)752 	void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
753 	{
754 		dst.x = src0.x + src1.x;
755 		dst.y = src0.y + src1.y;
756 		dst.z = src0.z + src1.z;
757 		dst.w = src0.w + src1.w;
758 	}
759 
iadd(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)760 	void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
761 	{
762 		dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
763 		dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
764 		dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
765 		dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
766 	}
767 
sub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)768 	void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
769 	{
770 		dst.x = src0.x - src1.x;
771 		dst.y = src0.y - src1.y;
772 		dst.z = src0.z - src1.z;
773 		dst.w = src0.w - src1.w;
774 	}
775 
isub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)776 	void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
777 	{
778 		dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
779 		dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
780 		dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
781 		dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
782 	}
783 
mad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)784 	void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
785 	{
786 		dst.x = src0.x * src1.x + src2.x;
787 		dst.y = src0.y * src1.y + src2.y;
788 		dst.z = src0.z * src1.z + src2.z;
789 		dst.w = src0.w * src1.w + src2.w;
790 	}
791 
imad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)792 	void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
793 	{
794 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
795 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
796 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
797 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
798 	}
799 
mul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)800 	void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
801 	{
802 		dst.x = src0.x * src1.x;
803 		dst.y = src0.y * src1.y;
804 		dst.z = src0.z * src1.z;
805 		dst.w = src0.w * src1.w;
806 	}
807 
imul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)808 	void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
809 	{
810 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
811 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
812 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
813 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
814 	}
815 
rcpx(Vector4f & dst,const Vector4f & src,bool pp)816 	void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
817 	{
818 		Float4 rcp = reciprocal(src.x, pp, true, true);
819 
820 		dst.x = rcp;
821 		dst.y = rcp;
822 		dst.z = rcp;
823 		dst.w = rcp;
824 	}
825 
div(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)826 	void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
827 	{
828 		dst.x = src0.x / src1.x;
829 		dst.y = src0.y / src1.y;
830 		dst.z = src0.z / src1.z;
831 		dst.w = src0.w / src1.w;
832 	}
833 
idiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)834 	void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
835 	{
836 		Float4 intMax(As<Float4>(Int4(INT_MAX)));
837 		cmp0i(dst.x, src1.x, intMax, src1.x);
838 		dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
839 		cmp0i(dst.y, src1.y, intMax, src1.y);
840 		dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
841 		cmp0i(dst.z, src1.z, intMax, src1.z);
842 		dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
843 		cmp0i(dst.w, src1.w, intMax, src1.w);
844 		dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
845 	}
846 
udiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)847 	void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
848 	{
849 		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));
850 		cmp0i(dst.x, src1.x, uintMax, src1.x);
851 		dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
852 		cmp0i(dst.y, src1.y, uintMax, src1.y);
853 		dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
854 		cmp0i(dst.z, src1.z, uintMax, src1.z);
855 		dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
856 		cmp0i(dst.w, src1.w, uintMax, src1.w);
857 		dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
858 	}
859 
mod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)860 	void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
861 	{
862 		dst.x = modulo(src0.x, src1.x);
863 		dst.y = modulo(src0.y, src1.y);
864 		dst.z = modulo(src0.z, src1.z);
865 		dst.w = modulo(src0.w, src1.w);
866 	}
867 
imod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)868 	void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
869 	{
870 		Float4 intMax(As<Float4>(Int4(INT_MAX)));
871 		cmp0i(dst.x, src1.x, intMax, src1.x);
872 		dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
873 		cmp0i(dst.y, src1.y, intMax, src1.y);
874 		dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
875 		cmp0i(dst.z, src1.z, intMax, src1.z);
876 		dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
877 		cmp0i(dst.w, src1.w, intMax, src1.w);
878 		dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
879 	}
880 
umod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)881 	void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
882 	{
883 		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));
884 		cmp0i(dst.x, src1.x, uintMax, src1.x);
885 		dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
886 		cmp0i(dst.y, src1.y, uintMax, src1.y);
887 		dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
888 		cmp0i(dst.z, src1.z, uintMax, src1.z);
889 		dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
890 		cmp0i(dst.w, src1.w, uintMax, src1.w);
891 		dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
892 	}
893 
shl(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)894 	void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
895 	{
896 		dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
897 		dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
898 		dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
899 		dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
900 	}
901 
ishr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)902 	void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
903 	{
904 		dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
905 		dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
906 		dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
907 		dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
908 	}
909 
ushr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)910 	void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
911 	{
912 		dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
913 		dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
914 		dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
915 		dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
916 	}
917 
rsqx(Vector4f & dst,const Vector4f & src,bool pp)918 	void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
919 	{
920 		Float4 rsq = reciprocalSquareRoot(src.x, true, pp);
921 
922 		dst.x = rsq;
923 		dst.y = rsq;
924 		dst.z = rsq;
925 		dst.w = rsq;
926 	}
927 
sqrt(Vector4f & dst,const Vector4f & src,bool pp)928 	void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp)
929 	{
930 		dst.x = Sqrt(src.x);
931 		dst.y = Sqrt(src.y);
932 		dst.z = Sqrt(src.z);
933 		dst.w = Sqrt(src.w);
934 	}
935 
rsq(Vector4f & dst,const Vector4f & src,bool pp)936 	void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp)
937 	{
938 		dst.x = reciprocalSquareRoot(src.x, false, pp);
939 		dst.y = reciprocalSquareRoot(src.y, false, pp);
940 		dst.z = reciprocalSquareRoot(src.z, false, pp);
941 		dst.w = reciprocalSquareRoot(src.w, false, pp);
942 	}
943 
len2(Float4 & dst,const Vector4f & src,bool pp)944 	void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp)
945 	{
946 		dst = Sqrt(dot2(src, src));
947 	}
948 
len3(Float4 & dst,const Vector4f & src,bool pp)949 	void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp)
950 	{
951 		dst = Sqrt(dot3(src, src));
952 	}
953 
len4(Float4 & dst,const Vector4f & src,bool pp)954 	void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp)
955 	{
956 		dst = Sqrt(dot4(src, src));
957 	}
958 
dist1(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)959 	void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
960 	{
961 		dst = Abs(src0.x - src1.x);
962 	}
963 
dist2(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)964 	void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
965 	{
966 		Float4 dx = src0.x - src1.x;
967 		Float4 dy = src0.y - src1.y;
968 		Float4 dot2 = dx * dx + dy * dy;
969 		dst = Sqrt(dot2);
970 	}
971 
dist3(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)972 	void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
973 	{
974 		Float4 dx = src0.x - src1.x;
975 		Float4 dy = src0.y - src1.y;
976 		Float4 dz = src0.z - src1.z;
977 		Float4 dot3 = dx * dx + dy * dy + dz * dz;
978 		dst = Sqrt(dot3);
979 	}
980 
dist4(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)981 	void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
982 	{
983 		Float4 dx = src0.x - src1.x;
984 		Float4 dy = src0.y - src1.y;
985 		Float4 dz = src0.z - src1.z;
986 		Float4 dw = src0.w - src1.w;
987 		Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw;
988 		dst = Sqrt(dot4);
989 	}
990 
dp1(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)991 	void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
992 	{
993 		Float4 t = src0.x * src1.x;
994 
995 		dst.x = t;
996 		dst.y = t;
997 		dst.z = t;
998 		dst.w = t;
999 	}
1000 
dp2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1001 	void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1002 	{
1003 		Float4 t = dot2(src0, src1);
1004 
1005 		dst.x = t;
1006 		dst.y = t;
1007 		dst.z = t;
1008 		dst.w = t;
1009 	}
1010 
dp2add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1011 	void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1012 	{
1013 		Float4 t = dot2(src0, src1) + src2.x;
1014 
1015 		dst.x = t;
1016 		dst.y = t;
1017 		dst.z = t;
1018 		dst.w = t;
1019 	}
1020 
dp3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1021 	void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1022 	{
1023 		Float4 dot = dot3(src0, src1);
1024 
1025 		dst.x = dot;
1026 		dst.y = dot;
1027 		dst.z = dot;
1028 		dst.w = dot;
1029 	}
1030 
dp4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1031 	void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1032 	{
1033 		Float4 dot = dot4(src0, src1);
1034 
1035 		dst.x = dot;
1036 		dst.y = dot;
1037 		dst.z = dot;
1038 		dst.w = dot;
1039 	}
1040 
min(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1041 	void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1042 	{
1043 		dst.x = Min(src0.x, src1.x);
1044 		dst.y = Min(src0.y, src1.y);
1045 		dst.z = Min(src0.z, src1.z);
1046 		dst.w = Min(src0.w, src1.w);
1047 	}
1048 
imin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1049 	void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1050 	{
1051 		dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
1052 		dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
1053 		dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
1054 		dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
1055 	}
1056 
umin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1057 	void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1058 	{
1059 		dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1060 		dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1061 		dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1062 		dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1063 	}
1064 
max(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1065 	void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1066 	{
1067 		dst.x = Max(src0.x, src1.x);
1068 		dst.y = Max(src0.y, src1.y);
1069 		dst.z = Max(src0.z, src1.z);
1070 		dst.w = Max(src0.w, src1.w);
1071 	}
1072 
imax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1073 	void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1074 	{
1075 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
1076 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
1077 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
1078 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
1079 	}
1080 
umax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1081 	void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1082 	{
1083 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
1084 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
1085 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
1086 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
1087 	}
1088 
slt(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1089 	void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1090 	{
1091 		dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f)));
1092 		dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f)));
1093 		dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f)));
1094 		dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f)));
1095 	}
1096 
step(Vector4f & dst,const Vector4f & edge,const Vector4f & x)1097 	void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x)
1098 	{
1099 		dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f)));
1100 		dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f)));
1101 		dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f)));
1102 		dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f)));
1103 	}
1104 
exp2x(Vector4f & dst,const Vector4f & src,bool pp)1105 	void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp)
1106 	{
1107 		Float4 exp = exponential2(src.x, pp);
1108 
1109 		dst.x = exp;
1110 		dst.y = exp;
1111 		dst.z = exp;
1112 		dst.w = exp;
1113 	}
1114 
exp2(Vector4f & dst,const Vector4f & src,bool pp)1115 	void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp)
1116 	{
1117 		dst.x = exponential2(src.x, pp);
1118 		dst.y = exponential2(src.y, pp);
1119 		dst.z = exponential2(src.z, pp);
1120 		dst.w = exponential2(src.w, pp);
1121 	}
1122 
exp(Vector4f & dst,const Vector4f & src,bool pp)1123 	void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp)
1124 	{
1125 		dst.x = exponential(src.x, pp);
1126 		dst.y = exponential(src.y, pp);
1127 		dst.z = exponential(src.z, pp);
1128 		dst.w = exponential(src.w, pp);
1129 	}
1130 
log2x(Vector4f & dst,const Vector4f & src,bool pp)1131 	void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp)
1132 	{
1133 		Float4 log = logarithm2(src.x, true, pp);
1134 
1135 		dst.x = log;
1136 		dst.y = log;
1137 		dst.z = log;
1138 		dst.w = log;
1139 	}
1140 
log2(Vector4f & dst,const Vector4f & src,bool pp)1141 	void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp)
1142 	{
1143 		dst.x = logarithm2(src.x, false, pp);
1144 		dst.y = logarithm2(src.y, false, pp);
1145 		dst.z = logarithm2(src.z, false, pp);
1146 		dst.w = logarithm2(src.w, false, pp);
1147 	}
1148 
log(Vector4f & dst,const Vector4f & src,bool pp)1149 	void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp)
1150 	{
1151 		dst.x = logarithm(src.x, false, pp);
1152 		dst.y = logarithm(src.y, false, pp);
1153 		dst.z = logarithm(src.z, false, pp);
1154 		dst.w = logarithm(src.w, false, pp);
1155 	}
1156 
lit(Vector4f & dst,const Vector4f & src)1157 	void ShaderCore::lit(Vector4f &dst, const Vector4f &src)
1158 	{
1159 		dst.x = Float4(1.0f);
1160 		dst.y = Max(src.x, Float4(0.0f));
1161 
1162 		Float4 pow;
1163 
1164 		pow = src.w;
1165 		pow = Min(pow, Float4(127.9961f));
1166 		pow = Max(pow, Float4(-127.9961f));
1167 
1168 		dst.z = power(src.y, pow);
1169 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f)));
1170 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f)));
1171 
1172 		dst.w = Float4(1.0f);
1173 	}
1174 
att(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1175 	void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1176 	{
1177 		// Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
1178 		dst.x = 1;
1179 		dst.y = src0.y * src1.y;
1180 		dst.z = src0.z;
1181 		dst.w = src1.w;
1182 	}
1183 
lrp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1184 	void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1185 	{
1186 		dst.x = src0.x * (src1.x - src2.x) + src2.x;
1187 		dst.y = src0.y * (src1.y - src2.y) + src2.y;
1188 		dst.z = src0.z * (src1.z - src2.z) + src2.z;
1189 		dst.w = src0.w * (src1.w - src2.w) + src2.w;
1190 	}
1191 
isinf(Vector4f & dst,const Vector4f & src)1192 	void ShaderCore::isinf(Vector4f &dst, const Vector4f &src)
1193 	{
1194 		dst.x = As<Float4>(IsInf(src.x));
1195 		dst.y = As<Float4>(IsInf(src.y));
1196 		dst.z = As<Float4>(IsInf(src.z));
1197 		dst.w = As<Float4>(IsInf(src.w));
1198 	}
1199 
isnan(Vector4f & dst,const Vector4f & src)1200 	void ShaderCore::isnan(Vector4f &dst, const Vector4f &src)
1201 	{
1202 		dst.x = As<Float4>(IsNan(src.x));
1203 		dst.y = As<Float4>(IsNan(src.y));
1204 		dst.z = As<Float4>(IsNan(src.z));
1205 		dst.w = As<Float4>(IsNan(src.w));
1206 	}
1207 
smooth(Vector4f & dst,const Vector4f & edge0,const Vector4f & edge1,const Vector4f & x)1208 	void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x)
1209 	{
1210 		Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx);
1211 		Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty);
1212 		Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz);
1213 		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
1214 	}
1215 
floatToHalfBits(Float4 & dst,const Float4 & floatBits,bool storeInUpperBits)1216 	void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)
1217 	{
1218 		static const uint32_t mask_sign = 0x80000000u;
1219 		static const uint32_t mask_round = ~0xfffu;
1220 		static const uint32_t c_f32infty = 255 << 23;
1221 		static const uint32_t c_magic = 15 << 23;
1222 		static const uint32_t c_nanbit = 0x200;
1223 		static const uint32_t c_infty_as_fp16 = 0x7c00;
1224 		static const uint32_t c_clamp = (31 << 23) - 0x1000;
1225 
1226 		UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits);
1227 		UInt4 absf = As<UInt4>(floatBits) ^ justsign;
1228 		UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf);
1229 
1230 		// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
1231 		//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
1232 		UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)),
1233 		                                 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) |
1234 		               ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) |
1235 		               UInt4(c_infty_as_fp16)));
1236 
1237 		dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16));
1238 	}
1239 
halfToFloatBits(Float4 & dst,const Float4 & halfBits)1240 	void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)
1241 	{
1242 		static const uint32_t mask_nosign = 0x7FFF;
1243 		static const uint32_t magic = (254 - 15) << 23;
1244 		static const uint32_t was_infnan = 0x7BFF;
1245 		static const uint32_t exp_infnan = 255 << 23;
1246 
1247 		UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign);
1248 		dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
1249 		                 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) |
1250 		                 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan)));
1251 	}
1252 
packHalf2x16(Vector4f & d,const Vector4f & s0)1253 	void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)
1254 	{
1255 		// half2 | half1
1256 		floatToHalfBits(d.x, s0.x, false);
1257 		floatToHalfBits(d.x, s0.y, true);
1258 	}
1259 
unpackHalf2x16(Vector4f & dst,const Vector4f & s0)1260 	void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)
1261 	{
1262 		// half2 | half1
1263 		halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF)));
1264 		halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16));
1265 	}
1266 
packSnorm2x16(Vector4f & d,const Vector4f & s0)1267 	void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
1268 	{
1269 		// round(clamp(c, -1.0, 1.0) * 32767.0)
1270 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) |
1271 		                ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16));
1272 	}
1273 
packUnorm2x16(Vector4f & d,const Vector4f & s0)1274 	void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0)
1275 	{
1276 		// round(clamp(c, 0.0, 1.0) * 65535.0)
1277 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) |
1278 		                ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16));
1279 	}
1280 
unpackSnorm2x16(Vector4f & dst,const Vector4f & s0)1281 	void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0)
1282 	{
1283 		// clamp(f / 32727.0, -1.0, 1.0)
1284 		dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
1285 		dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
1286 	}
1287 
unpackUnorm2x16(Vector4f & dst,const Vector4f & s0)1288 	void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0)
1289 	{
1290 		// f / 65535.0
1291 		dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000));
1292 		dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000));
1293 	}
1294 
det2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1295 	void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1296 	{
1297 		dst.x = src0.x * src1.y - src0.y * src1.x;
1298 		dst.y = dst.z = dst.w = dst.x;
1299 	}
1300 
det3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1301 	void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1302 	{
1303 		crs(dst, src1, src2);
1304 		dp3(dst, dst, src0);
1305 	}
1306 
det4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2,const Vector4f & src3)1307 	void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3)
1308 	{
1309 		dst.x = src2.z * src3.w - src2.w * src3.z;
1310 		dst.y = src1.w * src3.z - src1.z * src3.w;
1311 		dst.z = src1.z * src2.w - src1.w * src2.z;
1312 		dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) -
1313 		        src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) +
1314 		        src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) +
1315 		                  src2.x * (src1.w * src3.y - src1.y * src3.w) +
1316 		                  src3.x * (src1.y * src2.w - src1.w * src2.y)) +
1317 		        src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) +
1318 		                  src2.x * (src1.y * src3.z - src1.z * src3.y) +
1319 		                  src3.x * (src1.z * src2.y - src1.y * src2.z));
1320 		dst.y = dst.z = dst.w = dst.x;
1321 	}
1322 
frc(Vector4f & dst,const Vector4f & src)1323 	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
1324 	{
1325 		dst.x = Frac(src.x);
1326 		dst.y = Frac(src.y);
1327 		dst.z = Frac(src.z);
1328 		dst.w = Frac(src.w);
1329 	}
1330 
trunc(Vector4f & dst,const Vector4f & src)1331 	void ShaderCore::trunc(Vector4f &dst, const Vector4f &src)
1332 	{
1333 		dst.x = Trunc(src.x);
1334 		dst.y = Trunc(src.y);
1335 		dst.z = Trunc(src.z);
1336 		dst.w = Trunc(src.w);
1337 	}
1338 
floor(Vector4f & dst,const Vector4f & src)1339 	void ShaderCore::floor(Vector4f &dst, const Vector4f &src)
1340 	{
1341 		dst.x = Floor(src.x);
1342 		dst.y = Floor(src.y);
1343 		dst.z = Floor(src.z);
1344 		dst.w = Floor(src.w);
1345 	}
1346 
round(Vector4f & dst,const Vector4f & src)1347 	void ShaderCore::round(Vector4f &dst, const Vector4f &src)
1348 	{
1349 		dst.x = Round(src.x);
1350 		dst.y = Round(src.y);
1351 		dst.z = Round(src.z);
1352 		dst.w = Round(src.w);
1353 	}
1354 
roundEven(Vector4f & dst,const Vector4f & src)1355 	void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src)
1356 	{
1357 		// dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));
1358 		// ex.: 1.5:  2 + (0 * 2 - 1) * 1 * 0 = 2
1359 		//      2.5:  3 + (0 * 2 - 1) * 1 * 1 = 2
1360 		//     -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2
1361 		//     -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2
1362 		// Even if the round implementation rounds the other way:
1363 		//      1.5:  1 + (1 * 2 - 1) * 1 * 1 = 2
1364 		//      2.5:  2 + (1 * 2 - 1) * 1 * 0 = 2
1365 		//     -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2
1366 		//     -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2
1367 		round(dst, src);
1368 		dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1));
1369 		dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1));
1370 		dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1));
1371 		dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1));
1372 	}
1373 
ceil(Vector4f & dst,const Vector4f & src)1374 	void ShaderCore::ceil(Vector4f &dst, const Vector4f &src)
1375 	{
1376 		dst.x = Ceil(src.x);
1377 		dst.y = Ceil(src.y);
1378 		dst.z = Ceil(src.z);
1379 		dst.w = Ceil(src.w);
1380 	}
1381 
powx(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1382 	void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1383 	{
1384 		Float4 pow = power(src0.x, src1.x, pp);
1385 
1386 		dst.x = pow;
1387 		dst.y = pow;
1388 		dst.z = pow;
1389 		dst.w = pow;
1390 	}
1391 
pow(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1392 	void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1393 	{
1394 		dst.x = power(src0.x, src1.x, pp);
1395 		dst.y = power(src0.y, src1.y, pp);
1396 		dst.z = power(src0.z, src1.z, pp);
1397 		dst.w = power(src0.w, src1.w, pp);
1398 	}
1399 
crs(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1400 	void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1401 	{
1402 		dst.x = src0.y * src1.z - src0.z * src1.y;
1403 		dst.y = src0.z * src1.x - src0.x * src1.z;
1404 		dst.z = src0.x * src1.y - src0.y * src1.x;
1405 	}
1406 
forward1(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1407 	void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1408 	{
1409 		Int4 flip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000);
1410 
1411 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1412 	}
1413 
forward2(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1414 	void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1415 	{
1416 		Int4 flip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000);
1417 
1418 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1419 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
1420 	}
1421 
forward3(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1422 	void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1423 	{
1424 		Int4 flip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000);
1425 
1426 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1427 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
1428 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
1429 	}
1430 
forward4(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1431 	void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1432 	{
1433 		Int4 flip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000);
1434 
1435 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1436 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
1437 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
1438 		dst.w =  As<Float4>(flip ^ As<Int4>(N.w));
1439 	}
1440 
reflect1(Vector4f & dst,const Vector4f & I,const Vector4f & N)1441 	void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1442 	{
1443 		Float4 d = N.x * I.x;
1444 
1445 		dst.x = I.x - Float4(2.0f) * d * N.x;
1446 	}
1447 
reflect2(Vector4f & dst,const Vector4f & I,const Vector4f & N)1448 	void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1449 	{
1450 		Float4 d = dot2(N, I);
1451 
1452 		dst.x = I.x - Float4(2.0f) * d * N.x;
1453 		dst.y = I.y - Float4(2.0f) * d * N.y;
1454 	}
1455 
reflect3(Vector4f & dst,const Vector4f & I,const Vector4f & N)1456 	void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1457 	{
1458 		Float4 d = dot3(N, I);
1459 
1460 		dst.x = I.x - Float4(2.0f) * d * N.x;
1461 		dst.y = I.y - Float4(2.0f) * d * N.y;
1462 		dst.z = I.z - Float4(2.0f) * d * N.z;
1463 	}
1464 
reflect4(Vector4f & dst,const Vector4f & I,const Vector4f & N)1465 	void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1466 	{
1467 		Float4 d = dot4(N, I);
1468 
1469 		dst.x = I.x - Float4(2.0f) * d * N.x;
1470 		dst.y = I.y - Float4(2.0f) * d * N.y;
1471 		dst.z = I.z - Float4(2.0f) * d * N.z;
1472 		dst.w = I.w - Float4(2.0f) * d * N.w;
1473 	}
1474 
refract1(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1475 	void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1476 	{
1477 		Float4 d = N.x * I.x;
1478 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1479 		Int4 pos = CmpNLT(k, Float4(0.0f));
1480 		Float4 t = (eta * d + Sqrt(k));
1481 
1482 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1483 	}
1484 
refract2(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1485 	void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1486 	{
1487 		Float4 d = dot2(N, I);
1488 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1489 		Int4 pos = CmpNLT(k, Float4(0.0f));
1490 		Float4 t = (eta * d + Sqrt(k));
1491 
1492 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1493 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1494 	}
1495 
refract3(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1496 	void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1497 	{
1498 		Float4 d = dot3(N, I);
1499 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1500 		Int4 pos = CmpNLT(k, Float4(0.0f));
1501 		Float4 t = (eta * d + Sqrt(k));
1502 
1503 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1504 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1505 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
1506 	}
1507 
refract4(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1508 	void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1509 	{
1510 		Float4 d = dot4(N, I);
1511 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1512 		Int4 pos = CmpNLT(k, Float4(0.0f));
1513 		Float4 t = (eta * d + Sqrt(k));
1514 
1515 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1516 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1517 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
1518 		dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w));
1519 	}
1520 
sgn(Vector4f & dst,const Vector4f & src)1521 	void ShaderCore::sgn(Vector4f &dst, const Vector4f &src)
1522 	{
1523 		sgn(dst.x, src.x);
1524 		sgn(dst.y, src.y);
1525 		sgn(dst.z, src.z);
1526 		sgn(dst.w, src.w);
1527 	}
1528 
isgn(Vector4f & dst,const Vector4f & src)1529 	void ShaderCore::isgn(Vector4f &dst, const Vector4f &src)
1530 	{
1531 		isgn(dst.x, src.x);
1532 		isgn(dst.y, src.y);
1533 		isgn(dst.z, src.z);
1534 		isgn(dst.w, src.w);
1535 	}
1536 
abs(Vector4f & dst,const Vector4f & src)1537 	void ShaderCore::abs(Vector4f &dst, const Vector4f &src)
1538 	{
1539 		dst.x = Abs(src.x);
1540 		dst.y = Abs(src.y);
1541 		dst.z = Abs(src.z);
1542 		dst.w = Abs(src.w);
1543 	}
1544 
iabs(Vector4f & dst,const Vector4f & src)1545 	void ShaderCore::iabs(Vector4f &dst, const Vector4f &src)
1546 	{
1547 		dst.x = As<Float4>(Abs(As<Int4>(src.x)));
1548 		dst.y = As<Float4>(Abs(As<Int4>(src.y)));
1549 		dst.z = As<Float4>(Abs(As<Int4>(src.z)));
1550 		dst.w = As<Float4>(Abs(As<Int4>(src.w)));
1551 	}
1552 
nrm2(Vector4f & dst,const Vector4f & src,bool pp)1553 	void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp)
1554 	{
1555 		Float4 dot = dot2(src, src);
1556 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1557 
1558 		dst.x = src.x * rsq;
1559 		dst.y = src.y * rsq;
1560 		dst.z = src.z * rsq;
1561 		dst.w = src.w * rsq;
1562 	}
1563 
nrm3(Vector4f & dst,const Vector4f & src,bool pp)1564 	void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp)
1565 	{
1566 		Float4 dot = dot3(src, src);
1567 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1568 
1569 		dst.x = src.x * rsq;
1570 		dst.y = src.y * rsq;
1571 		dst.z = src.z * rsq;
1572 		dst.w = src.w * rsq;
1573 	}
1574 
nrm4(Vector4f & dst,const Vector4f & src,bool pp)1575 	void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp)
1576 	{
1577 		Float4 dot = dot4(src, src);
1578 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1579 
1580 		dst.x = src.x * rsq;
1581 		dst.y = src.y * rsq;
1582 		dst.z = src.z * rsq;
1583 		dst.w = src.w * rsq;
1584 	}
1585 
sincos(Vector4f & dst,const Vector4f & src,bool pp)1586 	void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp)
1587 	{
1588 		dst.x = cosine_pi(src.x, pp);
1589 		dst.y = sine_pi(src.x, pp);
1590 	}
1591 
cos(Vector4f & dst,const Vector4f & src,bool pp)1592 	void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp)
1593 	{
1594 		dst.x = cosine(src.x, pp);
1595 		dst.y = cosine(src.y, pp);
1596 		dst.z = cosine(src.z, pp);
1597 		dst.w = cosine(src.w, pp);
1598 	}
1599 
sin(Vector4f & dst,const Vector4f & src,bool pp)1600 	void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp)
1601 	{
1602 		dst.x = sine(src.x, pp);
1603 		dst.y = sine(src.y, pp);
1604 		dst.z = sine(src.z, pp);
1605 		dst.w = sine(src.w, pp);
1606 	}
1607 
tan(Vector4f & dst,const Vector4f & src,bool pp)1608 	void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp)
1609 	{
1610 		dst.x = tangent(src.x, pp);
1611 		dst.y = tangent(src.y, pp);
1612 		dst.z = tangent(src.z, pp);
1613 		dst.w = tangent(src.w, pp);
1614 	}
1615 
acos(Vector4f & dst,const Vector4f & src,bool pp)1616 	void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp)
1617 	{
1618 		dst.x = arccos(src.x, pp);
1619 		dst.y = arccos(src.y, pp);
1620 		dst.z = arccos(src.z, pp);
1621 		dst.w = arccos(src.w, pp);
1622 	}
1623 
asin(Vector4f & dst,const Vector4f & src,bool pp)1624 	void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp)
1625 	{
1626 		dst.x = arcsin(src.x, pp);
1627 		dst.y = arcsin(src.y, pp);
1628 		dst.z = arcsin(src.z, pp);
1629 		dst.w = arcsin(src.w, pp);
1630 	}
1631 
atan(Vector4f & dst,const Vector4f & src,bool pp)1632 	void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp)
1633 	{
1634 		dst.x = arctan(src.x, pp);
1635 		dst.y = arctan(src.y, pp);
1636 		dst.z = arctan(src.z, pp);
1637 		dst.w = arctan(src.w, pp);
1638 	}
1639 
atan2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1640 	void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1641 	{
1642 		dst.x = arctan(src0.x, src1.x, pp);
1643 		dst.y = arctan(src0.y, src1.y, pp);
1644 		dst.z = arctan(src0.z, src1.z, pp);
1645 		dst.w = arctan(src0.w, src1.w, pp);
1646 	}
1647 
cosh(Vector4f & dst,const Vector4f & src,bool pp)1648 	void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp)
1649 	{
1650 		dst.x = cosineh(src.x, pp);
1651 		dst.y = cosineh(src.y, pp);
1652 		dst.z = cosineh(src.z, pp);
1653 		dst.w = cosineh(src.w, pp);
1654 	}
1655 
sinh(Vector4f & dst,const Vector4f & src,bool pp)1656 	void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp)
1657 	{
1658 		dst.x = sineh(src.x, pp);
1659 		dst.y = sineh(src.y, pp);
1660 		dst.z = sineh(src.z, pp);
1661 		dst.w = sineh(src.w, pp);
1662 	}
1663 
tanh(Vector4f & dst,const Vector4f & src,bool pp)1664 	void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp)
1665 	{
1666 		dst.x = tangenth(src.x, pp);
1667 		dst.y = tangenth(src.y, pp);
1668 		dst.z = tangenth(src.z, pp);
1669 		dst.w = tangenth(src.w, pp);
1670 	}
1671 
acosh(Vector4f & dst,const Vector4f & src,bool pp)1672 	void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp)
1673 	{
1674 		dst.x = arccosh(src.x, pp);
1675 		dst.y = arccosh(src.y, pp);
1676 		dst.z = arccosh(src.z, pp);
1677 		dst.w = arccosh(src.w, pp);
1678 	}
1679 
asinh(Vector4f & dst,const Vector4f & src,bool pp)1680 	void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp)
1681 	{
1682 		dst.x = arcsinh(src.x, pp);
1683 		dst.y = arcsinh(src.y, pp);
1684 		dst.z = arcsinh(src.z, pp);
1685 		dst.w = arcsinh(src.w, pp);
1686 	}
1687 
atanh(Vector4f & dst,const Vector4f & src,bool pp)1688 	void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp)
1689 	{
1690 		dst.x = arctanh(src.x, pp);
1691 		dst.y = arctanh(src.y, pp);
1692 		dst.z = arctanh(src.z, pp);
1693 		dst.w = arctanh(src.w, pp);
1694 	}
1695 
expp(Vector4f & dst,const Vector4f & src,unsigned short shaderModel)1696 	void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
1697 	{
1698 		if(shaderModel < 0x0200)
1699 		{
1700 			Float4 frc = Frac(src.x);
1701 			Float4 floor = src.x - frc;
1702 
1703 			dst.x = exponential2(floor, true);
1704 			dst.y = frc;
1705 			dst.z = exponential2(src.x, true);
1706 			dst.w = Float4(1.0f);
1707 		}
1708 		else   // Version >= 2.0
1709 		{
1710 			exp2x(dst, src, true);   // FIXME: 10-bit precision suffices
1711 		}
1712 	}
1713 
logp(Vector4f & dst,const Vector4f & src,unsigned short shaderModel)1714 	void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
1715 	{
1716 		if(shaderModel < 0x0200)
1717 		{
1718 			Float4 tmp0;
1719 			Float4 tmp1;
1720 			Float4 t;
1721 			Int4 r;
1722 
1723 			tmp0 = Abs(src.x);
1724 			tmp1 = tmp0;
1725 
1726 			// X component
1727 			r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127);
1728 			dst.x = Float4(r);
1729 
1730 			// Y component
1731 			dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
1732 
1733 			// Z component
1734 			dst.z = logarithm2(src.x, true, true);
1735 
1736 			// W component
1737 			dst.w = 1.0f;
1738 		}
1739 		else
1740 		{
1741 			log2x(dst, src, true);
1742 		}
1743 	}
1744 
cmp0(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1745 	void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1746 	{
1747 		cmp0(dst.x, src0.x, src1.x, src2.x);
1748 		cmp0(dst.y, src0.y, src1.y, src2.y);
1749 		cmp0(dst.z, src0.z, src1.z, src2.z);
1750 		cmp0(dst.w, src0.w, src1.w, src2.w);
1751 	}
1752 
select(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1753 	void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1754 	{
1755 		select(dst.x, As<Int4>(src0.x), src1.x, src2.x);
1756 		select(dst.y, As<Int4>(src0.y), src1.y, src2.y);
1757 		select(dst.z, As<Int4>(src0.z), src1.z, src2.z);
1758 		select(dst.w, As<Int4>(src0.w), src1.w, src2.w);
1759 	}
1760 
	// Picks one component of src0 according to an index held in src1.
	// The bits of src1 are reinterpreted as integers (no float-to-int conversion):
	// 1 selects src0.y, 2 selects src0.z, 3 selects src0.w, and any other value
	// leaves the src0.x default in place.
	void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)
	{
		// dst is the running result: each later select keeps the previous dst unless its index matches.
		select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x);
		select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst);
		select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst);
	}
1767 
insert(Vector4f & dst,const Vector4f & src,const Float4 & element,const Float4 & index)1768 	void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)
1769 	{
1770 		select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x);
1771 		select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y);
1772 		select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z);
1773 		select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w);
1774 	}
1775 
sgn(Float4 & dst,const Float4 & src)1776 	void ShaderCore::sgn(Float4 &dst, const Float4 &src)
1777 	{
1778 		Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f));
1779 		Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f));
1780 		dst = As<Float4>(neg | pos);
1781 	}
1782 
isgn(Float4 & dst,const Float4 & src)1783 	void ShaderCore::isgn(Float4 &dst, const Float4 &src)
1784 	{
1785 		Int4 neg = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1);
1786 		Int4 pos = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1);
1787 		dst = As<Float4>(neg | pos);
1788 	}
1789 
cmp0(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1790 	void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
1791 	{
1792 		Int4 pos = CmpLE(Float4(0.0f), src0);
1793 		select(dst, pos, src1, src2);
1794 	}
1795 
cmp0i(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1796 	void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
1797 	{
1798 		Int4 pos = CmpEQ(Int4(0), As<Int4>(src0));
1799 		select(dst, pos, src1, src2);
1800 	}
1801 
select(Float4 & dst,RValue<Int4> src0,const Float4 & src1,const Float4 & src2)1802 	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
1803 	{
1804 		// FIXME: LLVM vector select
1805 		dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2)));
1806 	}
1807 
cmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1808 	void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1809 	{
1810 		switch(control)
1811 		{
1812 		case Shader::CONTROL_GT:
1813 			dst.x = As<Float4>(CmpNLE(src0.x, src1.x));
1814 			dst.y = As<Float4>(CmpNLE(src0.y, src1.y));
1815 			dst.z = As<Float4>(CmpNLE(src0.z, src1.z));
1816 			dst.w = As<Float4>(CmpNLE(src0.w, src1.w));
1817 			break;
1818 		case Shader::CONTROL_EQ:
1819 			dst.x = As<Float4>(CmpEQ(src0.x, src1.x));
1820 			dst.y = As<Float4>(CmpEQ(src0.y, src1.y));
1821 			dst.z = As<Float4>(CmpEQ(src0.z, src1.z));
1822 			dst.w = As<Float4>(CmpEQ(src0.w, src1.w));
1823 			break;
1824 		case Shader::CONTROL_GE:
1825 			dst.x = As<Float4>(CmpNLT(src0.x, src1.x));
1826 			dst.y = As<Float4>(CmpNLT(src0.y, src1.y));
1827 			dst.z = As<Float4>(CmpNLT(src0.z, src1.z));
1828 			dst.w = As<Float4>(CmpNLT(src0.w, src1.w));
1829 			break;
1830 		case Shader::CONTROL_LT:
1831 			dst.x = As<Float4>(CmpLT(src0.x, src1.x));
1832 			dst.y = As<Float4>(CmpLT(src0.y, src1.y));
1833 			dst.z = As<Float4>(CmpLT(src0.z, src1.z));
1834 			dst.w = As<Float4>(CmpLT(src0.w, src1.w));
1835 			break;
1836 		case Shader::CONTROL_NE:
1837 			dst.x = As<Float4>(CmpNEQ(src0.x, src1.x));
1838 			dst.y = As<Float4>(CmpNEQ(src0.y, src1.y));
1839 			dst.z = As<Float4>(CmpNEQ(src0.z, src1.z));
1840 			dst.w = As<Float4>(CmpNEQ(src0.w, src1.w));
1841 			break;
1842 		case Shader::CONTROL_LE:
1843 			dst.x = As<Float4>(CmpLE(src0.x, src1.x));
1844 			dst.y = As<Float4>(CmpLE(src0.y, src1.y));
1845 			dst.z = As<Float4>(CmpLE(src0.z, src1.z));
1846 			dst.w = As<Float4>(CmpLE(src0.w, src1.w));
1847 			break;
1848 		default:
1849 			ASSERT(false);
1850 		}
1851 	}
1852 
icmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1853 	void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1854 	{
1855 		switch(control)
1856 		{
1857 		case Shader::CONTROL_GT:
1858 			dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x)));
1859 			dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y)));
1860 			dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z)));
1861 			dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w)));
1862 			break;
1863 		case Shader::CONTROL_EQ:
1864 			dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
1865 			dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
1866 			dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
1867 			dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
1868 			break;
1869 		case Shader::CONTROL_GE:
1870 			dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x)));
1871 			dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y)));
1872 			dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z)));
1873 			dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w)));
1874 			break;
1875 		case Shader::CONTROL_LT:
1876 			dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x)));
1877 			dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y)));
1878 			dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z)));
1879 			dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w)));
1880 			break;
1881 		case Shader::CONTROL_NE:
1882 			dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
1883 			dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
1884 			dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
1885 			dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
1886 			break;
1887 		case Shader::CONTROL_LE:
1888 			dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x)));
1889 			dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y)));
1890 			dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z)));
1891 			dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w)));
1892 			break;
1893 		default:
1894 			ASSERT(false);
1895 		}
1896 	}
1897 
ucmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1898 	void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1899 	{
1900 		switch(control)
1901 		{
1902 		case Shader::CONTROL_GT:
1903 			dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1904 			dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1905 			dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1906 			dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1907 			break;
1908 		case Shader::CONTROL_EQ:
1909 			dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1910 			dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1911 			dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1912 			dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1913 			break;
1914 		case Shader::CONTROL_GE:
1915 			dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1916 			dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1917 			dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1918 			dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1919 			break;
1920 		case Shader::CONTROL_LT:
1921 			dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1922 			dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1923 			dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1924 			dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1925 			break;
1926 		case Shader::CONTROL_NE:
1927 			dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1928 			dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1929 			dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1930 			dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1931 			break;
1932 		case Shader::CONTROL_LE:
1933 			dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1934 			dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1935 			dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1936 			dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1937 			break;
1938 		default:
1939 			ASSERT(false);
1940 		}
1941 	}
1942 
all(Float4 & dst,const Vector4f & src)1943 	void ShaderCore::all(Float4 &dst, const Vector4f &src)
1944 	{
1945 		dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
1946 	}
1947 
any(Float4 & dst,const Vector4f & src)1948 	void ShaderCore::any(Float4 &dst, const Vector4f &src)
1949 	{
1950 		dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w));
1951 	}
1952 
bitwise_not(Vector4f & dst,const Vector4f & src)1953 	void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src)
1954 	{
1955 		dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF));
1956 		dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF));
1957 		dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF));
1958 		dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF));
1959 	}
1960 
bitwise_or(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1961 	void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1962 	{
1963 		dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x));
1964 		dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y));
1965 		dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z));
1966 		dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w));
1967 	}
1968 
bitwise_xor(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1969 	void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1970 	{
1971 		dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
1972 		dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
1973 		dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
1974 		dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
1975 	}
1976 
bitwise_and(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1977 	void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1978 	{
1979 		dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
1980 		dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
1981 		dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
1982 		dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
1983 	}
1984 
equal(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1985 	void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1986 	{
1987 		dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &
1988 		                   CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) &
1989 		                   CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
1990 		                   CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1991 		dst.y = dst.x;
1992 		dst.z = dst.x;
1993 		dst.w = dst.x;
1994 	}
1995 
notEqual(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1996 	void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1997 	{
1998 		dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) |
1999 		                   CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) |
2000 		                   CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) |
2001 		                   CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
2002 		dst.y = dst.x;
2003 		dst.z = dst.x;
2004 		dst.w = dst.x;
2005 	}
2006 }
2007