1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "ShaderCore.hpp"
16 
17 #include "Renderer/Renderer.hpp"
18 #include "Common/Debug.hpp"
19 
20 #include <limits.h>
21 
22 namespace sw
23 {
24 	extern TranscendentalPrecision logPrecision;
25 	extern TranscendentalPrecision expPrecision;
26 	extern TranscendentalPrecision rcpPrecision;
27 	extern TranscendentalPrecision rsqPrecision;
28 
Vector4s()29 	Vector4s::Vector4s()
30 	{
31 	}
32 
Vector4s(unsigned short x,unsigned short y,unsigned short z,unsigned short w)33 	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
34 	{
35 		this->x = Short4(x);
36 		this->y = Short4(y);
37 		this->z = Short4(z);
38 		this->w = Short4(w);
39 	}
40 
// Copy constructor: default-constructs the components, then copies x, y, z, w
// from rhs through the copy-assignment operator.
Vector4s::Vector4s(const Vector4s &rhs)
{
	*this = rhs;
}
48 
// Component-wise copy assignment. Returns *this so assignments can be chained.
Vector4s &Vector4s::operator=(const Vector4s &rhs)
{
	Vector4s &self = *this;

	self.x = rhs.x;
	self.y = rhs.y;
	self.z = rhs.z;
	self.w = rhs.w;

	return self;
}
58 
operator [](int i)59 	Short4 &Vector4s::operator[](int i)
60 	{
61 		switch(i)
62 		{
63 		case 0: return x;
64 		case 1: return y;
65 		case 2: return z;
66 		case 3: return w;
67 		}
68 
69 		return x;
70 	}
71 
Vector4i()72 	Vector4i::Vector4i()
73 	{
74 	}
75 
Vector4i(int x,int y,int z,int w)76 	Vector4i::Vector4i(int x, int y, int z, int w)
77 	{
78 		this->x = Int4(x);
79 		this->y = Int4(y);
80 		this->z = Int4(z);
81 		this->w = Int4(w);
82 	}
83 
// Copy constructor: default-constructs the components, then copies x, y, z, w
// from rhs through the copy-assignment operator.
Vector4i::Vector4i(const Vector4i &rhs)
{
	*this = rhs;
}
91 
// Component-wise copy assignment. Returns *this so assignments can be chained.
Vector4i &Vector4i::operator=(const Vector4i &rhs)
{
	Vector4i &self = *this;

	self.x = rhs.x;
	self.y = rhs.y;
	self.z = rhs.z;
	self.w = rhs.w;

	return self;
}
101 
operator [](int i)102 	Int4 &Vector4i::operator[](int i)
103 	{
104 		switch(i)
105 		{
106 		case 0: return x;
107 		case 1: return y;
108 		case 2: return z;
109 		case 3: return w;
110 		}
111 
112 		return x;
113 	}
114 
Vector4u()115 	Vector4u::Vector4u()
116 	{
117 	}
118 
Vector4u(unsigned int x,unsigned int y,unsigned int z,unsigned int w)119 	Vector4u::Vector4u(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
120 	{
121 		this->x = UInt4(x);
122 		this->y = UInt4(y);
123 		this->z = UInt4(z);
124 		this->w = UInt4(w);
125 	}
126 
// Copy constructor: default-constructs the components, then copies x, y, z, w
// from rhs through the copy-assignment operator.
Vector4u::Vector4u(const Vector4u &rhs)
{
	*this = rhs;
}
134 
// Component-wise copy assignment. Returns *this so assignments can be chained.
Vector4u &Vector4u::operator=(const Vector4u &rhs)
{
	Vector4u &self = *this;

	self.x = rhs.x;
	self.y = rhs.y;
	self.z = rhs.z;
	self.w = rhs.w;

	return self;
}
144 
operator [](int i)145 	UInt4 &Vector4u::operator[](int i)
146 	{
147 		switch(i)
148 		{
149 		case 0: return x;
150 		case 1: return y;
151 		case 2: return z;
152 		case 3: return w;
153 		}
154 
155 		return x;
156 	}
157 
Vector4f()158 	Vector4f::Vector4f()
159 	{
160 	}
161 
Vector4f(float x,float y,float z,float w)162 	Vector4f::Vector4f(float x, float y, float z, float w)
163 	{
164 		this->x = Float4(x);
165 		this->y = Float4(y);
166 		this->z = Float4(z);
167 		this->w = Float4(w);
168 	}
169 
// Copy constructor: default-constructs the components, then copies x, y, z, w
// from rhs through the copy-assignment operator.
Vector4f::Vector4f(const Vector4f &rhs)
{
	*this = rhs;
}
177 
// Component-wise copy assignment. Returns *this so assignments can be chained.
Vector4f &Vector4f::operator=(const Vector4f &rhs)
{
	Vector4f &self = *this;

	self.x = rhs.x;
	self.y = rhs.y;
	self.z = rhs.z;
	self.w = rhs.w;

	return self;
}
187 
operator [](int i)188 	Float4 &Vector4f::operator[](int i)
189 	{
190 		switch(i)
191 		{
192 		case 0: return x;
193 		case 1: return y;
194 		case 2: return z;
195 		case 3: return w;
196 		}
197 
198 		return x;
199 	}
200 
exponential2(RValue<Float4> x,bool pp)201 	Float4 exponential2(RValue<Float4> x, bool pp)
202 	{
203 		Float4 x0;
204 		Float4 x1;
205 		Int4 x2;
206 
207 		x0 = x;
208 
209 		x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
210 		x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f
211 		x1 = x0;
212 		x1 -= Float4(0.5f);
213 		x2 = RoundInt(x1);
214 		x1 = Float4(x2);
215 		x2 += Int4(0x0000007F);   // 127
216 		x2 = x2 << 23;
217 		x0 -= x1;
218 		x1 = As<Float4>(Int4(0x3AF61905));   // 1.8775767e-3f
219 		x1 *= x0;
220 		x1 += As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
221 		x1 *= x0;
222 		x1 += As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
223 		x1 *= x0;
224 		x1 += As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
225 		x1 *= x0;
226 		x1 += As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
227 		x1 *= x0;
228 		x1 += As<Float4>(Int4(0x3F7FFFFF));   // 9.9999994e-1f
229 		x1 *= As<Float4>(x2);
230 
231 		return x1;
232 	}
233 
logarithm2(RValue<Float4> x,bool absolute,bool pp)234 	Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
235 	{
236 		Float4 x0;
237 		Float4 x1;
238 		Float4 x2;
239 		Float4 x3;
240 
241 		x0 = x;
242 
243 		x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
244 		x1 = As<Float4>(As<UInt4>(x1) >> 8);
245 		x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
246 		x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
247 		x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
248 
249 		x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
250 		x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
251 		x2 /= x3;
252 
253 		x1 += (x0 - Float4(1.0f)) * x2;
254 
255 		return x1;
256 	}
257 
exponential(RValue<Float4> x,bool pp)258 	Float4 exponential(RValue<Float4> x, bool pp)
259 	{
260 		// FIXME: Propagate the constant
261 		return exponential2(Float4(1.44269541f) * x, pp);   // 1/ln(2)
262 	}
263 
logarithm(RValue<Float4> x,bool absolute,bool pp)264 	Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)
265 	{
266 		// FIXME: Propagate the constant
267 		return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp);   // ln(2)
268 	}
269 
power(RValue<Float4> x,RValue<Float4> y,bool pp)270 	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
271 	{
272 		Float4 log = logarithm2(x, true, pp);
273 		log *= y;
274 		return exponential2(log, pp);
275 	}
276 
reciprocal(RValue<Float4> x,bool pp,bool finite,bool exactAtPow2)277 	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
278 	{
279 		Float4 rcp;
280 
281 		if(!pp && rcpPrecision >= WHQL)
282 		{
283 			rcp = Float4(1.0f) / x;
284 		}
285 		else
286 		{
287 			rcp = Rcp_pp(x, exactAtPow2);
288 
289 			if(!pp)
290 			{
291 				rcp = (rcp + rcp) - (x * rcp * rcp);
292 			}
293 		}
294 
295 		if(finite)
296 		{
297 			int big = 0x7F7FFFFF;
298 			rcp = Min(rcp, Float4((float&)big));
299 		}
300 
301 		return rcp;
302 	}
303 
reciprocalSquareRoot(RValue<Float4> x,bool absolute,bool pp)304 	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
305 	{
306 		Float4 abs = x;
307 
308 		if(absolute)
309 		{
310 			abs = Abs(abs);
311 		}
312 
313 		Float4 rsq;
314 
315 		if(!pp && rsqPrecision >= IEEE)
316 		{
317 			rsq = Float4(1.0f) / Sqrt(abs);
318 		}
319 		else
320 		{
321 			rsq = RcpSqrt_pp(abs);
322 
323 			if(!pp)
324 			{
325 				rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
326 			}
327 		}
328 
329 		int big = 0x7F7FFFFF;
330 		rsq = Min(rsq, Float4((float&)big));
331 
332 		return rsq;
333 	}
334 
modulo(RValue<Float4> x,RValue<Float4> y)335 	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
336 	{
337 		return x - y * Floor(x / y);
338 	}
339 
sine_pi(RValue<Float4> x,bool pp)340 	Float4 sine_pi(RValue<Float4> x, bool pp)
341 	{
342 		const Float4 A = Float4(-4.05284734e-1f);   // -4/pi^2
343 		const Float4 B = Float4(1.27323954e+0f);    // 4/pi
344 		const Float4 C = Float4(7.75160950e-1f);
345 		const Float4 D = Float4(2.24839049e-1f);
346 
347 		// Parabola approximating sine
348 		Float4 sin = x * (Abs(x) * A + B);
349 
350 		// Improve precision from 0.06 to 0.001
351 		if(true)
352 		{
353 			sin = sin * (Abs(sin) * D + C);
354 		}
355 
356 		return sin;
357 	}
358 
cosine_pi(RValue<Float4> x,bool pp)359 	Float4 cosine_pi(RValue<Float4> x, bool pp)
360 	{
361 		// cos(x) = sin(x + pi/2)
362 		Float4 y = x + Float4(1.57079632e+0f);
363 
364 		// Wrap around
365 		y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));
366 
367 		return sine_pi(y, pp);
368 	}
369 
sine(RValue<Float4> x,bool pp)370 	Float4 sine(RValue<Float4> x, bool pp)
371 	{
372 		// Reduce to [-0.5, 0.5] range
373 		Float4 y = x * Float4(1.59154943e-1f);   // 1/2pi
374 		y = y - Round(y);
375 
376 		const Float4 A = Float4(-16.0f);
377 		const Float4 B = Float4(8.0f);
378 		const Float4 C = Float4(7.75160950e-1f);
379 		const Float4 D = Float4(2.24839049e-1f);
380 
381 		// Parabola approximating sine
382 		Float4 sin = y * (Abs(y) * A + B);
383 
384 		// Improve precision from 0.06 to 0.001
385 		if(true)
386 		{
387 			sin = sin * (Abs(sin) * D + C);
388 		}
389 
390 		return sin;
391 	}
392 
cosine(RValue<Float4> x,bool pp)393 	Float4 cosine(RValue<Float4> x, bool pp)
394 	{
395 		// cos(x) = sin(x + pi/2)
396 		Float4 y = x + Float4(1.57079632e+0f);
397 		return sine(y, pp);
398 	}
399 
tangent(RValue<Float4> x,bool pp)400 	Float4 tangent(RValue<Float4> x, bool pp)
401 	{
402 		return sine(x, pp) / cosine(x, pp);
403 	}
404 
arccos(RValue<Float4> x,bool pp)405 	Float4 arccos(RValue<Float4> x, bool pp)
406 	{
407 		// pi/2 - arcsin(x)
408 		return Float4(1.57079632e+0f) - arcsin(x);
409 	}
410 
arcsin(RValue<Float4> x,bool pp)411 	Float4 arcsin(RValue<Float4> x, bool pp)
412 	{
413 		// x*(pi/2-sqrt(1-x*x)*pi/5)
414 		return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
415 	}
416 
arctan(RValue<Float4> x,bool pp)417 	Float4 arctan(RValue<Float4> x, bool pp)
418 	{
419 		Int4 O = CmpNLT(Abs(x), Float4(1.0f));
420 		Float4 y = As<Float4>(O & As<Int4>(Float4(1.0f) / x) | ~O & As<Int4>(x));   // FIXME: Vector select
421 
422 		// Approximation of atan in [-1..1]
423 		Float4 theta = y * (Float4(-0.27f) * Abs(y) + Float4(1.05539816f));
424 
425 		// +/-pi/2 depending on sign of x
426 		Float4 sgnPi_2 = As<Float4>(As<Int4>(Float4(1.57079632e+0f)) ^ (As<Int4>(x) & Int4(0x80000000)));
427 
428 		theta = As<Float4>(O & As<Int4>(sgnPi_2 - theta) | ~O & As<Int4>(theta));   // FIXME: Vector select
429 
430 		return theta;
431 	}
432 
arctan(RValue<Float4> y,RValue<Float4> x,bool pp)433 	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
434 	{
435 		// Rotate to upper semicircle when in lower semicircle
436 		Int4 S = CmpLT(y, Float4(0.0f));
437 		Float4 theta = As<Float4>(S & As<Int4>(Float4(-3.14159265e+0f)));   // -pi
438 		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
439 		Float4 y0 = Abs(y);
440 
441 		// Rotate to right quadrant when in left quadrant
442 		Int4 Q = CmpLT(x0, Float4(0.0f));
443 		theta += As<Float4>(Q & As<Int4>(Float4(1.57079632e+0f)));   // pi/2
444 		Float4 x1 = As<Float4>(Q & As<Int4>(y0) | ~Q & As<Int4>(x0));    // FIXME: Vector select
445 		Float4 y1 = As<Float4>(Q & As<Int4>(-x0) | ~Q & As<Int4>(y0));   // FIXME: Vector select
446 
447 		// Rotate to first octant when in second octant
448 		Int4 O = CmpNLT(y1, x1);
449 		theta += As<Float4>(O & As<Int4>(Float4(7.85398163e-1f)));   // pi/4
450 		Float4 x2 = As<Float4>(O & As<Int4>(Float4(7.07106781e-1f) * x1 + Float4(7.07106781e-1f) * y1) | ~O & As<Int4>(x1));   // sqrt(2)/2   // FIXME: Vector select
451 		Float4 y2 = As<Float4>(O & As<Int4>(Float4(7.07106781e-1f) * y1 - Float4(7.07106781e-1f) * x1) | ~O & As<Int4>(y1));   // FIXME: Vector select
452 
453 		// Approximation of atan in [0..1]
454 		Float4 y_x = y2 / x2;
455 		theta += y_x * (Float4(-0.27f) * y_x + Float4(1.05539816f));
456 
457 		return theta;
458 	}
459 
sineh(RValue<Float4> x,bool pp)460 	Float4 sineh(RValue<Float4> x, bool pp)
461 	{
462 		return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);
463 	}
464 
cosineh(RValue<Float4> x,bool pp)465 	Float4 cosineh(RValue<Float4> x, bool pp)
466 	{
467 		return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);
468 	}
469 
tangenth(RValue<Float4> x,bool pp)470 	Float4 tangenth(RValue<Float4> x, bool pp)
471 	{
472 		Float4 e_x = exponential(x, pp);
473 		Float4 e_minus_x = exponential(-x, pp);
474 		return (e_x - e_minus_x) / (e_x + e_minus_x);
475 	}
476 
arccosh(RValue<Float4> x,bool pp)477 	Float4 arccosh(RValue<Float4> x, bool pp)
478 	{
479 		return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp);
480 	}
481 
arcsinh(RValue<Float4> x,bool pp)482 	Float4 arcsinh(RValue<Float4> x, bool pp)
483 	{
484 		return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp);
485 	}
486 
arctanh(RValue<Float4> x,bool pp)487 	Float4 arctanh(RValue<Float4> x, bool pp)
488 	{
489 		return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f);
490 	}
491 
dot2(const Vector4f & v0,const Vector4f & v1)492 	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
493 	{
494 		return v0.x * v1.x + v0.y * v1.y;
495 	}
496 
dot3(const Vector4f & v0,const Vector4f & v1)497 	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
498 	{
499 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
500 	}
501 
dot4(const Vector4f & v0,const Vector4f & v1)502 	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
503 	{
504 		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
505 	}
506 
transpose4x4(Short4 & row0,Short4 & row1,Short4 & row2,Short4 & row3)507 	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
508 	{
509 		Int2 tmp0 = UnpackHigh(row0, row1);
510 		Int2 tmp1 = UnpackHigh(row2, row3);
511 		Int2 tmp2 = UnpackLow(row0, row1);
512 		Int2 tmp3 = UnpackLow(row2, row3);
513 
514 		row0 = As<Short4>(UnpackLow(tmp2, tmp3));
515 		row1 = As<Short4>(UnpackHigh(tmp2, tmp3));
516 		row2 = As<Short4>(UnpackLow(tmp0, tmp1));
517 		row3 = As<Short4>(UnpackHigh(tmp0, tmp1));
518 	}
519 
transpose4x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)520 	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
521 	{
522 		Float4 tmp0 = UnpackLow(row0, row1);
523 		Float4 tmp1 = UnpackLow(row2, row3);
524 		Float4 tmp2 = UnpackHigh(row0, row1);
525 		Float4 tmp3 = UnpackHigh(row2, row3);
526 
527 		row0 = Float4(tmp0.xy, tmp1.xy);
528 		row1 = Float4(tmp0.zw, tmp1.zw);
529 		row2 = Float4(tmp2.xy, tmp3.xy);
530 		row3 = Float4(tmp2.zw, tmp3.zw);
531 	}
532 
transpose4x3(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)533 	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
534 	{
535 		Float4 tmp0 = UnpackLow(row0, row1);
536 		Float4 tmp1 = UnpackLow(row2, row3);
537 		Float4 tmp2 = UnpackHigh(row0, row1);
538 		Float4 tmp3 = UnpackHigh(row2, row3);
539 
540 		row0 = Float4(tmp0.xy, tmp1.xy);
541 		row1 = Float4(tmp0.zw, tmp1.zw);
542 		row2 = Float4(tmp2.xy, tmp3.xy);
543 	}
544 
transpose4x2(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)545 	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
546 	{
547 		Float4 tmp0 = UnpackLow(row0, row1);
548 		Float4 tmp1 = UnpackLow(row2, row3);
549 
550 		row0 = Float4(tmp0.xy, tmp1.xy);
551 		row1 = Float4(tmp0.zw, tmp1.zw);
552 	}
553 
transpose4x1(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)554 	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
555 	{
556 		Float4 tmp0 = UnpackLow(row0, row1);
557 		Float4 tmp1 = UnpackLow(row2, row3);
558 
559 		row0 = Float4(tmp0.xy, tmp1.xy);
560 	}
561 
transpose2x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)562 	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
563 	{
564 		row0 = UnpackLow(row0, row1);
565 		row1 = Float4(row0.zw, row1.zw);
566 		row2 = UnpackHigh(row0, row1);
567 		row3 = Float4(row2.zw, row3.zw);
568 	}
569 
transpose2x4h(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)570 	void transpose2x4h(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
571 	{
572 		row0 = UnpackLow(row2, row3);
573 		row1 = Float4(row0.zw, row1.zw);
574 		row2 = UnpackHigh(row2, row3);
575 		row3 = Float4(row2.zw, row3.zw);
576 	}
577 
transpose4xN(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3,int N)578 	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
579 	{
580 		switch(N)
581 		{
582 		case 1: transpose4x1(row0, row1, row2, row3); break;
583 		case 2: transpose4x2(row0, row1, row2, row3); break;
584 		case 3: transpose4x3(row0, row1, row2, row3); break;
585 		case 4: transpose4x4(row0, row1, row2, row3); break;
586 		}
587 	}
588 
mov(Vector4f & dst,const Vector4f & src,bool integerDestination)589 	void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
590 	{
591 		if(integerDestination)
592 		{
593 			dst.x = As<Float4>(RoundInt(src.x));
594 			dst.y = As<Float4>(RoundInt(src.y));
595 			dst.z = As<Float4>(RoundInt(src.z));
596 			dst.w = As<Float4>(RoundInt(src.w));
597 		}
598 		else
599 		{
600 			dst = src;
601 		}
602 	}
603 
neg(Vector4f & dst,const Vector4f & src)604 	void ShaderCore::neg(Vector4f &dst, const Vector4f &src)
605 	{
606 		dst.x = -src.x;
607 		dst.y = -src.y;
608 		dst.z = -src.z;
609 		dst.w = -src.w;
610 	}
611 
ineg(Vector4f & dst,const Vector4f & src)612 	void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)
613 	{
614 		dst.x = As<Float4>(-As<Int4>(src.x));
615 		dst.y = As<Float4>(-As<Int4>(src.y));
616 		dst.z = As<Float4>(-As<Int4>(src.z));
617 		dst.w = As<Float4>(-As<Int4>(src.w));
618 	}
619 
f2b(Vector4f & dst,const Vector4f & src)620 	void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)
621 	{
622 		dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));
623 		dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f)));
624 		dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f)));
625 		dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f)));
626 	}
627 
b2f(Vector4f & dst,const Vector4f & src)628 	void ShaderCore::b2f(Vector4f &dst, const Vector4f &src)
629 	{
630 		dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f)));
631 		dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f)));
632 		dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f)));
633 		dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
634 	}
635 
f2i(Vector4f & dst,const Vector4f & src)636 	void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)
637 	{
638 		dst.x = As<Float4>(Int4(src.x));
639 		dst.y = As<Float4>(Int4(src.y));
640 		dst.z = As<Float4>(Int4(src.z));
641 		dst.w = As<Float4>(Int4(src.w));
642 	}
643 
i2f(Vector4f & dst,const Vector4f & src)644 	void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)
645 	{
646 		dst.x = Float4(As<Int4>(src.x));
647 		dst.y = Float4(As<Int4>(src.y));
648 		dst.z = Float4(As<Int4>(src.z));
649 		dst.w = Float4(As<Int4>(src.w));
650 	}
651 
f2u(Vector4f & dst,const Vector4f & src)652 	void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)
653 	{
654 		dst.x = As<Float4>(UInt4(src.x));
655 		dst.y = As<Float4>(UInt4(src.y));
656 		dst.z = As<Float4>(UInt4(src.z));
657 		dst.w = As<Float4>(UInt4(src.w));
658 	}
659 
u2f(Vector4f & dst,const Vector4f & src)660 	void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)
661 	{
662 		dst.x = Float4(As<UInt4>(src.x));
663 		dst.y = Float4(As<UInt4>(src.y));
664 		dst.z = Float4(As<UInt4>(src.z));
665 		dst.w = Float4(As<UInt4>(src.w));
666 	}
667 
i2b(Vector4f & dst,const Vector4f & src)668 	void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)
669 	{
670 		dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0)));
671 		dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0)));
672 		dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0)));
673 		dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0)));
674 	}
675 
b2i(Vector4f & dst,const Vector4f & src)676 	void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)
677 	{
678 		dst.x = As<Float4>(As<Int4>(src.x) & Int4(1));
679 		dst.y = As<Float4>(As<Int4>(src.y) & Int4(1));
680 		dst.z = As<Float4>(As<Int4>(src.z) & Int4(1));
681 		dst.w = As<Float4>(As<Int4>(src.w) & Int4(1));
682 	}
683 
add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)684 	void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
685 	{
686 		dst.x = src0.x + src1.x;
687 		dst.y = src0.y + src1.y;
688 		dst.z = src0.z + src1.z;
689 		dst.w = src0.w + src1.w;
690 	}
691 
iadd(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)692 	void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
693 	{
694 		dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
695 		dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
696 		dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
697 		dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
698 	}
699 
sub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)700 	void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
701 	{
702 		dst.x = src0.x - src1.x;
703 		dst.y = src0.y - src1.y;
704 		dst.z = src0.z - src1.z;
705 		dst.w = src0.w - src1.w;
706 	}
707 
isub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)708 	void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
709 	{
710 		dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
711 		dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
712 		dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
713 		dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
714 	}
715 
mad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)716 	void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
717 	{
718 		dst.x = src0.x * src1.x + src2.x;
719 		dst.y = src0.y * src1.y + src2.y;
720 		dst.z = src0.z * src1.z + src2.z;
721 		dst.w = src0.w * src1.w + src2.w;
722 	}
723 
imad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)724 	void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
725 	{
726 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
727 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
728 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
729 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
730 	}
731 
mul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)732 	void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
733 	{
734 		dst.x = src0.x * src1.x;
735 		dst.y = src0.y * src1.y;
736 		dst.z = src0.z * src1.z;
737 		dst.w = src0.w * src1.w;
738 	}
739 
imul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)740 	void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
741 	{
742 		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
743 		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
744 		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
745 		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
746 	}
747 
rcpx(Vector4f & dst,const Vector4f & src,bool pp)748 	void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
749 	{
750 		Float4 rcp = reciprocal(src.x, pp, true);
751 
752 		dst.x = rcp;
753 		dst.y = rcp;
754 		dst.z = rcp;
755 		dst.w = rcp;
756 	}
757 
div(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)758 	void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
759 	{
760 		dst.x = src0.x / src1.x;
761 		dst.y = src0.y / src1.y;
762 		dst.z = src0.z / src1.z;
763 		dst.w = src0.w / src1.w;
764 	}
765 
idiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)766 	void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
767 	{
768 		Float4 intMax(As<Float4>(Int4(INT_MAX)));
769 		cmp0i(dst.x, src1.x, intMax, src1.x);
770 		dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
771 		cmp0i(dst.y, src1.y, intMax, src1.y);
772 		dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
773 		cmp0i(dst.z, src1.z, intMax, src1.z);
774 		dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
775 		cmp0i(dst.w, src1.w, intMax, src1.w);
776 		dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
777 	}
778 
udiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)779 	void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
780 	{
781 		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));
782 		cmp0i(dst.x, src1.x, uintMax, src1.x);
783 		dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
784 		cmp0i(dst.y, src1.y, uintMax, src1.y);
785 		dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
786 		cmp0i(dst.z, src1.z, uintMax, src1.z);
787 		dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
788 		cmp0i(dst.w, src1.w, uintMax, src1.w);
789 		dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
790 	}
791 
mod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)792 	void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
793 	{
794 		dst.x = modulo(src0.x, src1.x);
795 		dst.y = modulo(src0.y, src1.y);
796 		dst.z = modulo(src0.z, src1.z);
797 		dst.w = modulo(src0.w, src1.w);
798 	}
799 
imod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)800 	void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
801 	{
802 		cmp0i(dst.x, src1.x, src0.x, src1.x);
803 		dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
804 		cmp0i(dst.y, src1.y, src0.y, src1.y);
805 		dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
806 		cmp0i(dst.z, src1.z, src0.z, src1.z);
807 		dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
808 		cmp0i(dst.w, src1.w, src0.w, src1.w);
809 		dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
810 	}
umod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)811 	void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
812 	{
813 		cmp0i(dst.x, src1.x, src0.x, src1.x);
814 		dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
815 		cmp0i(dst.y, src1.y, src0.y, src1.y);
816 		dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
817 		cmp0i(dst.z, src1.z, src0.z, src1.z);
818 		dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
819 		cmp0i(dst.w, src1.w, src0.w, src1.w);
820 		dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
821 	}
822 
shl(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)823 	void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
824 	{
825 		dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
826 		dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
827 		dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
828 		dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
829 	}
830 
ishr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)831 	void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
832 	{
833 		dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
834 		dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
835 		dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
836 		dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
837 	}
838 
ushr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)839 	void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
840 	{
841 		dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
842 		dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
843 		dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
844 		dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
845 	}
846 
rsqx(Vector4f & dst,const Vector4f & src,bool pp)847 	void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
848 	{
849 		Float4 rsq = reciprocalSquareRoot(src.x, true, pp);
850 
851 		dst.x = rsq;
852 		dst.y = rsq;
853 		dst.z = rsq;
854 		dst.w = rsq;
855 	}
856 
sqrt(Vector4f & dst,const Vector4f & src,bool pp)857 	void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp)
858 	{
859 		dst.x = Sqrt(src.x);
860 		dst.y = Sqrt(src.y);
861 		dst.z = Sqrt(src.z);
862 		dst.w = Sqrt(src.w);
863 	}
864 
rsq(Vector4f & dst,const Vector4f & src,bool pp)865 	void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp)
866 	{
867 		dst.x = reciprocalSquareRoot(src.x, false, pp);
868 		dst.y = reciprocalSquareRoot(src.y, false, pp);
869 		dst.z = reciprocalSquareRoot(src.z, false, pp);
870 		dst.w = reciprocalSquareRoot(src.w, false, pp);
871 	}
872 
len2(Float4 & dst,const Vector4f & src,bool pp)873 	void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp)
874 	{
875 		dst = Sqrt(dot2(src, src));
876 	}
877 
len3(Float4 & dst,const Vector4f & src,bool pp)878 	void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp)
879 	{
880 		dst = Sqrt(dot3(src, src));
881 	}
882 
len4(Float4 & dst,const Vector4f & src,bool pp)883 	void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp)
884 	{
885 		dst = Sqrt(dot4(src, src));
886 	}
887 
dist1(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)888 	void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
889 	{
890 		dst = Abs(src0.x - src1.x);
891 	}
892 
dist2(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)893 	void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
894 	{
895 		Float4 dx = src0.x - src1.x;
896 		Float4 dy = src0.y - src1.y;
897 		Float4 dot2 = dx * dx + dy * dy;
898 		dst = Sqrt(dot2);
899 	}
900 
dist3(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)901 	void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
902 	{
903 		Float4 dx = src0.x - src1.x;
904 		Float4 dy = src0.y - src1.y;
905 		Float4 dz = src0.z - src1.z;
906 		Float4 dot3 = dx * dx + dy * dy + dz * dz;
907 		dst = Sqrt(dot3);
908 	}
909 
dist4(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)910 	void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
911 	{
912 		Float4 dx = src0.x - src1.x;
913 		Float4 dy = src0.y - src1.y;
914 		Float4 dz = src0.z - src1.z;
915 		Float4 dw = src0.w - src1.w;
916 		Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw;
917 		dst = Sqrt(dot4);
918 	}
919 
dp1(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)920 	void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
921 	{
922 		Float4 t = src0.x * src1.x;
923 
924 		dst.x = t;
925 		dst.y = t;
926 		dst.z = t;
927 		dst.w = t;
928 	}
929 
dp2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)930 	void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
931 	{
932 		Float4 t = dot2(src0, src1);
933 
934 		dst.x = t;
935 		dst.y = t;
936 		dst.z = t;
937 		dst.w = t;
938 	}
939 
dp2add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)940 	void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
941 	{
942 		Float4 t = dot2(src0, src1) + src2.x;
943 
944 		dst.x = t;
945 		dst.y = t;
946 		dst.z = t;
947 		dst.w = t;
948 	}
949 
dp3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)950 	void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
951 	{
952 		Float4 dot = dot3(src0, src1);
953 
954 		dst.x = dot;
955 		dst.y = dot;
956 		dst.z = dot;
957 		dst.w = dot;
958 	}
959 
dp4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)960 	void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
961 	{
962 		Float4 dot = dot4(src0, src1);
963 
964 		dst.x = dot;
965 		dst.y = dot;
966 		dst.z = dot;
967 		dst.w = dot;
968 	}
969 
min(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)970 	void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
971 	{
972 		dst.x = Min(src0.x, src1.x);
973 		dst.y = Min(src0.y, src1.y);
974 		dst.z = Min(src0.z, src1.z);
975 		dst.w = Min(src0.w, src1.w);
976 	}
977 
imin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)978 	void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
979 	{
980 		dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
981 		dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
982 		dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
983 		dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
984 	}
985 
umin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)986 	void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
987 	{
988 		dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
989 		dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
990 		dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
991 		dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
992 	}
993 
max(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)994 	void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
995 	{
996 		dst.x = Max(src0.x, src1.x);
997 		dst.y = Max(src0.y, src1.y);
998 		dst.z = Max(src0.z, src1.z);
999 		dst.w = Max(src0.w, src1.w);
1000 	}
1001 
imax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1002 	void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1003 	{
1004 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
1005 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
1006 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
1007 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
1008 	}
1009 
umax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1010 	void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1011 	{
1012 		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
1013 		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
1014 		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
1015 		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
1016 	}
1017 
slt(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1018 	void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1019 	{
1020 		dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f)));
1021 		dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f)));
1022 		dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f)));
1023 		dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f)));
1024 	}
1025 
step(Vector4f & dst,const Vector4f & edge,const Vector4f & x)1026 	void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x)
1027 	{
1028 		dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f)));
1029 		dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f)));
1030 		dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f)));
1031 		dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f)));
1032 	}
1033 
exp2x(Vector4f & dst,const Vector4f & src,bool pp)1034 	void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp)
1035 	{
1036 		Float4 exp = exponential2(src.x, pp);
1037 
1038 		dst.x = exp;
1039 		dst.y = exp;
1040 		dst.z = exp;
1041 		dst.w = exp;
1042 	}
1043 
exp2(Vector4f & dst,const Vector4f & src,bool pp)1044 	void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp)
1045 	{
1046 		dst.x = exponential2(src.x, pp);
1047 		dst.y = exponential2(src.y, pp);
1048 		dst.z = exponential2(src.z, pp);
1049 		dst.w = exponential2(src.w, pp);
1050 	}
1051 
exp(Vector4f & dst,const Vector4f & src,bool pp)1052 	void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp)
1053 	{
1054 		dst.x = exponential(src.x, pp);
1055 		dst.y = exponential(src.y, pp);
1056 		dst.z = exponential(src.z, pp);
1057 		dst.w = exponential(src.w, pp);
1058 	}
1059 
log2x(Vector4f & dst,const Vector4f & src,bool pp)1060 	void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp)
1061 	{
1062 		Float4 log = logarithm2(src.x, true, pp);
1063 
1064 		dst.x = log;
1065 		dst.y = log;
1066 		dst.z = log;
1067 		dst.w = log;
1068 	}
1069 
log2(Vector4f & dst,const Vector4f & src,bool pp)1070 	void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp)
1071 	{
1072 		dst.x = logarithm2(src.x, pp);
1073 		dst.y = logarithm2(src.y, pp);
1074 		dst.z = logarithm2(src.z, pp);
1075 		dst.w = logarithm2(src.w, pp);
1076 	}
1077 
log(Vector4f & dst,const Vector4f & src,bool pp)1078 	void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp)
1079 	{
1080 		dst.x = logarithm(src.x, false, pp);
1081 		dst.y = logarithm(src.y, false, pp);
1082 		dst.z = logarithm(src.z, false, pp);
1083 		dst.w = logarithm(src.w, false, pp);
1084 	}
1085 
lit(Vector4f & dst,const Vector4f & src)1086 	void ShaderCore::lit(Vector4f &dst, const Vector4f &src)
1087 	{
1088 		dst.x = Float4(1.0f);
1089 		dst.y = Max(src.x, Float4(0.0f));
1090 
1091 		Float4 pow;
1092 
1093 		pow = src.w;
1094 		pow = Min(pow, Float4(127.9961f));
1095 		pow = Max(pow, Float4(-127.9961f));
1096 
1097 		dst.z = power(src.y, pow);
1098 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f)));
1099 		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f)));
1100 
1101 		dst.w = Float4(1.0f);
1102 	}
1103 
att(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1104 	void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1105 	{
1106 		// Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
1107 		dst.x = 1;
1108 		dst.y = src0.y * src1.y;
1109 		dst.z = src0.z;
1110 		dst.w = src1.w;
1111 	}
1112 
lrp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1113 	void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1114 	{
1115 		dst.x = src0.x * (src1.x - src2.x) + src2.x;
1116 		dst.y = src0.y * (src1.y - src2.y) + src2.y;
1117 		dst.z = src0.z * (src1.z - src2.z) + src2.z;
1118 		dst.w = src0.w * (src1.w - src2.w) + src2.w;
1119 	}
1120 
smooth(Vector4f & dst,const Vector4f & edge0,const Vector4f & edge1,const Vector4f & x)1121 	void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x)
1122 	{
1123 		Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx);
1124 		Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty);
1125 		Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz);
1126 		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
1127 	}
1128 
floatToHalfBits(Float4 & dst,const Float4 & floatBits,bool storeInUpperBits)1129 	void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)
1130 	{
1131 		static const uint32_t mask_sign = 0x80000000u;
1132 		static const uint32_t mask_round = ~0xfffu;
1133 		static const uint32_t c_f32infty = 255 << 23;
1134 		static const uint32_t c_magic = 15 << 23;
1135 		static const uint32_t c_nanbit = 0x200;
1136 		static const uint32_t c_infty_as_fp16 = 0x7c00;
1137 		static const uint32_t c_clamp = (31 << 23) - 0x1000;
1138 
1139 		UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits);
1140 		UInt4 absf = As<UInt4>(floatBits) ^ justsign;
1141 		UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf);
1142 
1143 		// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
1144 		//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
1145 		UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)),
1146 		                                 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) |
1147 		               ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) |
1148 		               UInt4(c_infty_as_fp16)));
1149 
1150 		dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16));
1151 	}
1152 
halfToFloatBits(Float4 & dst,const Float4 & halfBits)1153 	void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)
1154 	{
1155 		static const uint32_t mask_nosign = 0x7FFF;
1156 		static const uint32_t magic = (254 - 15) << 23;
1157 		static const uint32_t was_infnan = 0x7BFF;
1158 		static const uint32_t exp_infnan = 255 << 23;
1159 
1160 		UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign);
1161 		dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
1162 		                 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) |
1163 		                 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan)));
1164 	}
1165 
packHalf2x16(Vector4f & d,const Vector4f & s0)1166 	void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)
1167 	{
1168 		// half2 | half1
1169 		floatToHalfBits(d.x, s0.x, false);
1170 		floatToHalfBits(d.x, s0.y, true);
1171 	}
1172 
unpackHalf2x16(Vector4f & dst,const Vector4f & s0)1173 	void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)
1174 	{
1175 		// half2 | half1
1176 		halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF)));
1177 		halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16));
1178 	}
1179 
packSnorm2x16(Vector4f & d,const Vector4f & s0)1180 	void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
1181 	{
1182 		// round(clamp(c, -1.0, 1.0) * 32767.0)
1183 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) |
1184 		                ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16));
1185 	}
1186 
packUnorm2x16(Vector4f & d,const Vector4f & s0)1187 	void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0)
1188 	{
1189 		// round(clamp(c, 0.0, 1.0) * 65535.0)
1190 		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) |
1191 		                ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16));
1192 	}
1193 
unpackSnorm2x16(Vector4f & dst,const Vector4f & s0)1194 	void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0)
1195 	{
1196 		// clamp(f / 32727.0, -1.0, 1.0)
1197 		dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
1198 		dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
1199 	}
1200 
unpackUnorm2x16(Vector4f & dst,const Vector4f & s0)1201 	void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0)
1202 	{
1203 		// f / 65535.0
1204 		dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000));
1205 		dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000));
1206 	}
1207 
det2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1208 	void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1209 	{
1210 		dst.x = src0.x * src1.y - src0.y * src1.x;
1211 		dst.y = dst.z = dst.w = dst.x;
1212 	}
1213 
det3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1214 	void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1215 	{
1216 		crs(dst, src1, src2);
1217 		dp3(dst, dst, src0);
1218 	}
1219 
det4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2,const Vector4f & src3)1220 	void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3)
1221 	{
1222 		dst.x = src2.z * src3.w - src2.w * src3.z;
1223 		dst.y = src1.w * src3.z - src1.z * src3.w;
1224 		dst.z = src1.z * src2.w - src1.w * src2.z;
1225 		dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) -
1226 		        src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) +
1227 		        src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) +
1228 		                  src2.x * (src1.w * src3.y - src1.y * src3.w) +
1229 		                  src3.x * (src1.y * src2.w - src1.w * src2.y)) +
1230 		        src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) +
1231 		                  src2.x * (src1.y * src3.z - src1.z * src3.y) +
1232 		                  src3.x * (src1.z * src2.y - src1.y * src2.z));
1233 		dst.y = dst.z = dst.w = dst.x;
1234 	}
1235 
frc(Vector4f & dst,const Vector4f & src)1236 	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
1237 	{
1238 		dst.x = Frac(src.x);
1239 		dst.y = Frac(src.y);
1240 		dst.z = Frac(src.z);
1241 		dst.w = Frac(src.w);
1242 	}
1243 
trunc(Vector4f & dst,const Vector4f & src)1244 	void ShaderCore::trunc(Vector4f &dst, const Vector4f &src)
1245 	{
1246 		dst.x = Trunc(src.x);
1247 		dst.y = Trunc(src.y);
1248 		dst.z = Trunc(src.z);
1249 		dst.w = Trunc(src.w);
1250 	}
1251 
floor(Vector4f & dst,const Vector4f & src)1252 	void ShaderCore::floor(Vector4f &dst, const Vector4f &src)
1253 	{
1254 		dst.x = Floor(src.x);
1255 		dst.y = Floor(src.y);
1256 		dst.z = Floor(src.z);
1257 		dst.w = Floor(src.w);
1258 	}
1259 
round(Vector4f & dst,const Vector4f & src)1260 	void ShaderCore::round(Vector4f &dst, const Vector4f &src)
1261 	{
1262 		dst.x = Round(src.x);
1263 		dst.y = Round(src.y);
1264 		dst.z = Round(src.z);
1265 		dst.w = Round(src.w);
1266 	}
1267 
roundEven(Vector4f & dst,const Vector4f & src)1268 	void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src)
1269 	{
1270 		// dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));
1271 		// ex.: 1.5:  2 + (0 * 2 - 1) * 1 * 0 = 2
1272 		//      2.5:  3 + (0 * 2 - 1) * 1 * 1 = 2
1273 		//     -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2
1274 		//     -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2
1275 		// Even if the round implementation rounds the other way:
1276 		//      1.5:  1 + (1 * 2 - 1) * 1 * 1 = 2
1277 		//      2.5:  2 + (1 * 2 - 1) * 1 * 0 = 2
1278 		//     -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2
1279 		//     -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2
1280 		round(dst, src);
1281 		dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1));
1282 		dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1));
1283 		dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1));
1284 		dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1));
1285 	}
1286 
ceil(Vector4f & dst,const Vector4f & src)1287 	void ShaderCore::ceil(Vector4f &dst, const Vector4f &src)
1288 	{
1289 		dst.x = Ceil(src.x);
1290 		dst.y = Ceil(src.y);
1291 		dst.z = Ceil(src.z);
1292 		dst.w = Ceil(src.w);
1293 	}
1294 
powx(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1295 	void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1296 	{
1297 		Float4 pow = power(src0.x, src1.x, pp);
1298 
1299 		dst.x = pow;
1300 		dst.y = pow;
1301 		dst.z = pow;
1302 		dst.w = pow;
1303 	}
1304 
pow(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1305 	void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1306 	{
1307 		dst.x = power(src0.x, src1.x, pp);
1308 		dst.y = power(src0.y, src1.y, pp);
1309 		dst.z = power(src0.z, src1.z, pp);
1310 		dst.w = power(src0.w, src1.w, pp);
1311 	}
1312 
crs(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1313 	void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1314 	{
1315 		dst.x = src0.y * src1.z - src0.z * src1.y;
1316 		dst.y = src0.z * src1.x - src0.x * src1.z;
1317 		dst.z = src0.x * src1.y - src0.y * src1.x;
1318 	}
1319 
forward1(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1320 	void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1321 	{
1322 		Int4 flip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000);
1323 
1324 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1325 	}
1326 
forward2(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1327 	void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1328 	{
1329 		Int4 flip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000);
1330 
1331 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1332 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
1333 	}
1334 
forward3(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1335 	void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1336 	{
1337 		Int4 flip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000);
1338 
1339 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1340 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
1341 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
1342 	}
1343 
forward4(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1344 	void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)
1345 	{
1346 		Int4 flip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000);
1347 
1348 		dst.x =  As<Float4>(flip ^ As<Int4>(N.x));
1349 		dst.y =  As<Float4>(flip ^ As<Int4>(N.y));
1350 		dst.z =  As<Float4>(flip ^ As<Int4>(N.z));
1351 		dst.w =  As<Float4>(flip ^ As<Int4>(N.w));
1352 	}
1353 
reflect1(Vector4f & dst,const Vector4f & I,const Vector4f & N)1354 	void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1355 	{
1356 		Float4 d = N.x * I.x;
1357 
1358 		dst.x = I.x - Float4(2.0f) * d * N.x;
1359 	}
1360 
reflect2(Vector4f & dst,const Vector4f & I,const Vector4f & N)1361 	void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1362 	{
1363 		Float4 d = dot2(N, I);
1364 
1365 		dst.x = I.x - Float4(2.0f) * d * N.x;
1366 		dst.y = I.y - Float4(2.0f) * d * N.y;
1367 	}
1368 
reflect3(Vector4f & dst,const Vector4f & I,const Vector4f & N)1369 	void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1370 	{
1371 		Float4 d = dot3(N, I);
1372 
1373 		dst.x = I.x - Float4(2.0f) * d * N.x;
1374 		dst.y = I.y - Float4(2.0f) * d * N.y;
1375 		dst.z = I.z - Float4(2.0f) * d * N.z;
1376 	}
1377 
reflect4(Vector4f & dst,const Vector4f & I,const Vector4f & N)1378 	void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N)
1379 	{
1380 		Float4 d = dot4(N, I);
1381 
1382 		dst.x = I.x - Float4(2.0f) * d * N.x;
1383 		dst.y = I.y - Float4(2.0f) * d * N.y;
1384 		dst.z = I.z - Float4(2.0f) * d * N.z;
1385 		dst.w = I.w - Float4(2.0f) * d * N.w;
1386 	}
1387 
refract1(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1388 	void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1389 	{
1390 		Float4 d = N.x * I.x;
1391 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1392 		Int4 pos = CmpNLT(k, Float4(0.0f));
1393 		Float4 t = (eta * d + Sqrt(k));
1394 
1395 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1396 	}
1397 
refract2(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1398 	void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1399 	{
1400 		Float4 d = dot2(N, I);
1401 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1402 		Int4 pos = CmpNLT(k, Float4(0.0f));
1403 		Float4 t = (eta * d + Sqrt(k));
1404 
1405 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1406 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1407 	}
1408 
refract3(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1409 	void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1410 	{
1411 		Float4 d = dot3(N, I);
1412 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1413 		Int4 pos = CmpNLT(k, Float4(0.0f));
1414 		Float4 t = (eta * d + Sqrt(k));
1415 
1416 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1417 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1418 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
1419 	}
1420 
refract4(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1421 	void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)
1422 	{
1423 		Float4 d = dot4(N, I);
1424 		Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d);
1425 		Int4 pos = CmpNLT(k, Float4(0.0f));
1426 		Float4 t = (eta * d + Sqrt(k));
1427 
1428 		dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x));
1429 		dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y));
1430 		dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z));
1431 		dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w));
1432 	}
1433 
sgn(Vector4f & dst,const Vector4f & src)1434 	void ShaderCore::sgn(Vector4f &dst, const Vector4f &src)
1435 	{
1436 		sgn(dst.x, src.x);
1437 		sgn(dst.y, src.y);
1438 		sgn(dst.z, src.z);
1439 		sgn(dst.w, src.w);
1440 	}
1441 
isgn(Vector4f & dst,const Vector4f & src)1442 	void ShaderCore::isgn(Vector4f &dst, const Vector4f &src)
1443 	{
1444 		isgn(dst.x, src.x);
1445 		isgn(dst.y, src.y);
1446 		isgn(dst.z, src.z);
1447 		isgn(dst.w, src.w);
1448 	}
1449 
abs(Vector4f & dst,const Vector4f & src)1450 	void ShaderCore::abs(Vector4f &dst, const Vector4f &src)
1451 	{
1452 		dst.x = Abs(src.x);
1453 		dst.y = Abs(src.y);
1454 		dst.z = Abs(src.z);
1455 		dst.w = Abs(src.w);
1456 	}
1457 
iabs(Vector4f & dst,const Vector4f & src)1458 	void ShaderCore::iabs(Vector4f &dst, const Vector4f &src)
1459 	{
1460 		dst.x = As<Float4>(Abs(As<Int4>(src.x)));
1461 		dst.y = As<Float4>(Abs(As<Int4>(src.y)));
1462 		dst.z = As<Float4>(Abs(As<Int4>(src.z)));
1463 		dst.w = As<Float4>(Abs(As<Int4>(src.w)));
1464 	}
1465 
nrm2(Vector4f & dst,const Vector4f & src,bool pp)1466 	void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp)
1467 	{
1468 		Float4 dot = dot2(src, src);
1469 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1470 
1471 		dst.x = src.x * rsq;
1472 		dst.y = src.y * rsq;
1473 		dst.z = src.z * rsq;
1474 		dst.w = src.w * rsq;
1475 	}
1476 
nrm3(Vector4f & dst,const Vector4f & src,bool pp)1477 	void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp)
1478 	{
1479 		Float4 dot = dot3(src, src);
1480 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1481 
1482 		dst.x = src.x * rsq;
1483 		dst.y = src.y * rsq;
1484 		dst.z = src.z * rsq;
1485 		dst.w = src.w * rsq;
1486 	}
1487 
nrm4(Vector4f & dst,const Vector4f & src,bool pp)1488 	void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp)
1489 	{
1490 		Float4 dot = dot4(src, src);
1491 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
1492 
1493 		dst.x = src.x * rsq;
1494 		dst.y = src.y * rsq;
1495 		dst.z = src.z * rsq;
1496 		dst.w = src.w * rsq;
1497 	}
1498 
sincos(Vector4f & dst,const Vector4f & src,bool pp)1499 	void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp)
1500 	{
1501 		dst.x = cosine_pi(src.x, pp);
1502 		dst.y = sine_pi(src.x, pp);
1503 	}
1504 
cos(Vector4f & dst,const Vector4f & src,bool pp)1505 	void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp)
1506 	{
1507 		dst.x = cosine(src.x, pp);
1508 		dst.y = cosine(src.y, pp);
1509 		dst.z = cosine(src.z, pp);
1510 		dst.w = cosine(src.w, pp);
1511 	}
1512 
sin(Vector4f & dst,const Vector4f & src,bool pp)1513 	void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp)
1514 	{
1515 		dst.x = sine(src.x, pp);
1516 		dst.y = sine(src.y, pp);
1517 		dst.z = sine(src.z, pp);
1518 		dst.w = sine(src.w, pp);
1519 	}
1520 
tan(Vector4f & dst,const Vector4f & src,bool pp)1521 	void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp)
1522 	{
1523 		dst.x = tangent(src.x, pp);
1524 		dst.y = tangent(src.y, pp);
1525 		dst.z = tangent(src.z, pp);
1526 		dst.w = tangent(src.w, pp);
1527 	}
1528 
acos(Vector4f & dst,const Vector4f & src,bool pp)1529 	void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp)
1530 	{
1531 		dst.x = arccos(src.x, pp);
1532 		dst.y = arccos(src.y, pp);
1533 		dst.z = arccos(src.z, pp);
1534 		dst.w = arccos(src.w, pp);
1535 	}
1536 
asin(Vector4f & dst,const Vector4f & src,bool pp)1537 	void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp)
1538 	{
1539 		dst.x = arcsin(src.x, pp);
1540 		dst.y = arcsin(src.y, pp);
1541 		dst.z = arcsin(src.z, pp);
1542 		dst.w = arcsin(src.w, pp);
1543 	}
1544 
atan(Vector4f & dst,const Vector4f & src,bool pp)1545 	void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp)
1546 	{
1547 		dst.x = arctan(src.x, pp);
1548 		dst.y = arctan(src.y, pp);
1549 		dst.z = arctan(src.z, pp);
1550 		dst.w = arctan(src.w, pp);
1551 	}
1552 
atan2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1553 	void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
1554 	{
1555 		dst.x = arctan(src0.x, src1.x, pp);
1556 		dst.y = arctan(src0.y, src1.y, pp);
1557 		dst.z = arctan(src0.z, src1.z, pp);
1558 		dst.w = arctan(src0.w, src1.w, pp);
1559 	}
1560 
cosh(Vector4f & dst,const Vector4f & src,bool pp)1561 	void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp)
1562 	{
1563 		dst.x = cosineh(src.x, pp);
1564 		dst.y = cosineh(src.y, pp);
1565 		dst.z = cosineh(src.z, pp);
1566 		dst.w = cosineh(src.w, pp);
1567 	}
1568 
sinh(Vector4f & dst,const Vector4f & src,bool pp)1569 	void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp)
1570 	{
1571 		dst.x = sineh(src.x, pp);
1572 		dst.y = sineh(src.y, pp);
1573 		dst.z = sineh(src.z, pp);
1574 		dst.w = sineh(src.w, pp);
1575 	}
1576 
tanh(Vector4f & dst,const Vector4f & src,bool pp)1577 	void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp)
1578 	{
1579 		dst.x = tangenth(src.x, pp);
1580 		dst.y = tangenth(src.y, pp);
1581 		dst.z = tangenth(src.z, pp);
1582 		dst.w = tangenth(src.w, pp);
1583 	}
1584 
acosh(Vector4f & dst,const Vector4f & src,bool pp)1585 	void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp)
1586 	{
1587 		dst.x = arccosh(src.x, pp);
1588 		dst.y = arccosh(src.y, pp);
1589 		dst.z = arccosh(src.z, pp);
1590 		dst.w = arccosh(src.w, pp);
1591 	}
1592 
asinh(Vector4f & dst,const Vector4f & src,bool pp)1593 	void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp)
1594 	{
1595 		dst.x = arcsinh(src.x, pp);
1596 		dst.y = arcsinh(src.y, pp);
1597 		dst.z = arcsinh(src.z, pp);
1598 		dst.w = arcsinh(src.w, pp);
1599 	}
1600 
atanh(Vector4f & dst,const Vector4f & src,bool pp)1601 	void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp)
1602 	{
1603 		dst.x = arctanh(src.x, pp);
1604 		dst.y = arctanh(src.y, pp);
1605 		dst.z = arctanh(src.z, pp);
1606 		dst.w = arctanh(src.w, pp);
1607 	}
1608 
expp(Vector4f & dst,const Vector4f & src,unsigned short version)1609 	void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short version)
1610 	{
1611 		if(version < 0x0200)
1612 		{
1613 			Float4 frc = Frac(src.x);
1614 			Float4 floor = src.x - frc;
1615 
1616 			dst.x = exponential2(floor, true);
1617 			dst.y = frc;
1618 			dst.z = exponential2(src.x, true);
1619 			dst.w = Float4(1.0f);
1620 		}
1621 		else   // Version >= 2.0
1622 		{
1623 			exp2x(dst, src, true);   // FIXME: 10-bit precision suffices
1624 		}
1625 	}
1626 
logp(Vector4f & dst,const Vector4f & src,unsigned short version)1627 	void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short version)
1628 	{
1629 		if(version < 0x0200)
1630 		{
1631 			Float4 tmp0;
1632 			Float4 tmp1;
1633 			Float4 t;
1634 			Int4 r;
1635 
1636 			tmp0 = Abs(src.x);
1637 			tmp1 = tmp0;
1638 
1639 			// X component
1640 			r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127);
1641 			dst.x = Float4(r);
1642 
1643 			// Y component
1644 			dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
1645 
1646 			// Z component
1647 			dst.z = logarithm2(src.x, true, true);
1648 
1649 			// W component
1650 			dst.w = 1.0f;
1651 		}
1652 		else
1653 		{
1654 			log2x(dst, src, true);
1655 		}
1656 	}
1657 
cmp0(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1658 	void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1659 	{
1660 		cmp0(dst.x, src0.x, src1.x, src2.x);
1661 		cmp0(dst.y, src0.y, src1.y, src2.y);
1662 		cmp0(dst.z, src0.z, src1.z, src2.z);
1663 		cmp0(dst.w, src0.w, src1.w, src2.w);
1664 	}
1665 
select(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1666 	void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
1667 	{
1668 		select(dst.x, As<Int4>(src0.x), src1.x, src2.x);
1669 		select(dst.y, As<Int4>(src0.y), src1.y, src2.y);
1670 		select(dst.z, As<Int4>(src0.z), src1.z, src2.z);
1671 		select(dst.w, As<Int4>(src0.w), src1.w, src2.w);
1672 	}
1673 
extract(Float4 & dst,const Vector4f & src0,const Float4 & src1)1674 	void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)
1675 	{
1676 		select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x);
1677 		select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst);
1678 		select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst);
1679 	}
1680 
insert(Vector4f & dst,const Vector4f & src,const Float4 & element,const Float4 & index)1681 	void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)
1682 	{
1683 		select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x);
1684 		select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y);
1685 		select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z);
1686 		select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w);
1687 	}
1688 
sgn(Float4 & dst,const Float4 & src)1689 	void ShaderCore::sgn(Float4 &dst, const Float4 &src)
1690 	{
1691 		Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f));
1692 		Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f));
1693 		dst = As<Float4>(neg | pos);
1694 	}
1695 
isgn(Float4 & dst,const Float4 & src)1696 	void ShaderCore::isgn(Float4 &dst, const Float4 &src)
1697 	{
1698 		Int4 neg = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1);
1699 		Int4 pos = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1);
1700 		dst = As<Float4>(neg | pos);
1701 	}
1702 
cmp0(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1703 	void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
1704 	{
1705 		Int4 pos = CmpLE(Float4(0.0f), src0);
1706 		select(dst, pos, src1, src2);
1707 	}
1708 
cmp0i(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1709 	void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
1710 	{
1711 		Int4 pos = CmpEQ(Int4(0), As<Int4>(src0));
1712 		select(dst, pos, src1, src2);
1713 	}
1714 
select(Float4 & dst,RValue<Int4> src0,const Float4 & src1,const Float4 & src2)1715 	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
1716 	{
1717 		// FIXME: LLVM vector select
1718 		dst = As<Float4>(src0 & As<Int4>(src1) | ~src0 & As<Int4>(src2));
1719 	}
1720 
cmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1721 	void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1722 	{
1723 		switch(control)
1724 		{
1725 		case Shader::CONTROL_GT:
1726 			dst.x = As<Float4>(CmpNLE(src0.x, src1.x));
1727 			dst.y = As<Float4>(CmpNLE(src0.y, src1.y));
1728 			dst.z = As<Float4>(CmpNLE(src0.z, src1.z));
1729 			dst.w = As<Float4>(CmpNLE(src0.w, src1.w));
1730 			break;
1731 		case Shader::CONTROL_EQ:
1732 			dst.x = As<Float4>(CmpEQ(src0.x, src1.x));
1733 			dst.y = As<Float4>(CmpEQ(src0.y, src1.y));
1734 			dst.z = As<Float4>(CmpEQ(src0.z, src1.z));
1735 			dst.w = As<Float4>(CmpEQ(src0.w, src1.w));
1736 			break;
1737 		case Shader::CONTROL_GE:
1738 			dst.x = As<Float4>(CmpNLT(src0.x, src1.x));
1739 			dst.y = As<Float4>(CmpNLT(src0.y, src1.y));
1740 			dst.z = As<Float4>(CmpNLT(src0.z, src1.z));
1741 			dst.w = As<Float4>(CmpNLT(src0.w, src1.w));
1742 			break;
1743 		case Shader::CONTROL_LT:
1744 			dst.x = As<Float4>(CmpLT(src0.x, src1.x));
1745 			dst.y = As<Float4>(CmpLT(src0.y, src1.y));
1746 			dst.z = As<Float4>(CmpLT(src0.z, src1.z));
1747 			dst.w = As<Float4>(CmpLT(src0.w, src1.w));
1748 			break;
1749 		case Shader::CONTROL_NE:
1750 			dst.x = As<Float4>(CmpNEQ(src0.x, src1.x));
1751 			dst.y = As<Float4>(CmpNEQ(src0.y, src1.y));
1752 			dst.z = As<Float4>(CmpNEQ(src0.z, src1.z));
1753 			dst.w = As<Float4>(CmpNEQ(src0.w, src1.w));
1754 			break;
1755 		case Shader::CONTROL_LE:
1756 			dst.x = As<Float4>(CmpLE(src0.x, src1.x));
1757 			dst.y = As<Float4>(CmpLE(src0.y, src1.y));
1758 			dst.z = As<Float4>(CmpLE(src0.z, src1.z));
1759 			dst.w = As<Float4>(CmpLE(src0.w, src1.w));
1760 			break;
1761 		default:
1762 			ASSERT(false);
1763 		}
1764 	}
1765 
icmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1766 	void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1767 	{
1768 		switch(control)
1769 		{
1770 		case Shader::CONTROL_GT:
1771 			dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x)));
1772 			dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y)));
1773 			dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z)));
1774 			dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w)));
1775 			break;
1776 		case Shader::CONTROL_EQ:
1777 			dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
1778 			dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
1779 			dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
1780 			dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
1781 			break;
1782 		case Shader::CONTROL_GE:
1783 			dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x)));
1784 			dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y)));
1785 			dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z)));
1786 			dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w)));
1787 			break;
1788 		case Shader::CONTROL_LT:
1789 			dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x)));
1790 			dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y)));
1791 			dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z)));
1792 			dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w)));
1793 			break;
1794 		case Shader::CONTROL_NE:
1795 			dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
1796 			dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
1797 			dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
1798 			dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
1799 			break;
1800 		case Shader::CONTROL_LE:
1801 			dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x)));
1802 			dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y)));
1803 			dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z)));
1804 			dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w)));
1805 			break;
1806 		default:
1807 			ASSERT(false);
1808 		}
1809 	}
1810 
ucmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1811 	void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
1812 	{
1813 		switch(control)
1814 		{
1815 		case Shader::CONTROL_GT:
1816 			dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1817 			dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1818 			dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1819 			dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1820 			break;
1821 		case Shader::CONTROL_EQ:
1822 			dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1823 			dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1824 			dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1825 			dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1826 			break;
1827 		case Shader::CONTROL_GE:
1828 			dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1829 			dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1830 			dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1831 			dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1832 			break;
1833 		case Shader::CONTROL_LT:
1834 			dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1835 			dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1836 			dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1837 			dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1838 			break;
1839 		case Shader::CONTROL_NE:
1840 			dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1841 			dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1842 			dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1843 			dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1844 			break;
1845 		case Shader::CONTROL_LE:
1846 			dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
1847 			dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
1848 			dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
1849 			dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1850 			break;
1851 		default:
1852 			ASSERT(false);
1853 		}
1854 	}
1855 
all(Float4 & dst,const Vector4f & src)1856 	void ShaderCore::all(Float4 &dst, const Vector4f &src)
1857 	{
1858 		dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
1859 	}
1860 
any(Float4 & dst,const Vector4f & src)1861 	void ShaderCore::any(Float4 &dst, const Vector4f &src)
1862 	{
1863 		dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w));
1864 	}
1865 
not(Vector4f & dst,const Vector4f & src)1866 	void ShaderCore::not(Vector4f &dst, const Vector4f &src)
1867 	{
1868 		dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF));
1869 		dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF));
1870 		dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF));
1871 		dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF));
1872 	}
1873 
or(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1874 	void ShaderCore::or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1875 	{
1876 		dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x));
1877 		dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y));
1878 		dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z));
1879 		dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w));
1880 	}
1881 
xor(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1882 	void ShaderCore::xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1883 	{
1884 		dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
1885 		dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
1886 		dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
1887 		dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
1888 	}
1889 
and(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1890 	void ShaderCore::and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1891 	{
1892 		dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
1893 		dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
1894 		dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
1895 		dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
1896 	}
1897 
equal(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1898 	void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1899 	{
1900 		dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &
1901 		                   CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) &
1902 		                   CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
1903 		                   CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1904 		dst.y = dst.x;
1905 		dst.z = dst.x;
1906 		dst.w = dst.x;
1907 	}
1908 
notEqual(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1909 	void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
1910 	{
1911 		dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) |
1912 		                   CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) |
1913 		                   CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) |
1914 		                   CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
1915 		dst.y = dst.x;
1916 		dst.z = dst.x;
1917 		dst.w = dst.x;
1918 	}
1919 }
1920