1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "SamplerCore.hpp"
18 #include "Constants.hpp"
19 #include "Renderer/Renderer.hpp"
20 #include "Renderer/QuadRasterizer.hpp"
21 #include "Renderer/Surface.hpp"
22 #include "Renderer/Primitive.hpp"
23 #include "Common/Debug.hpp"
24 
25 namespace sw
26 {
27 	extern bool complementaryDepthBuffer;
28 	extern bool postBlendSRGB;
29 	extern bool exactColorRounding;
30 	extern bool forceClearRegisters;
31 
PixelRoutine(const PixelProcessor::State & state,const PixelShader * shader)32 	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
33 	{
34 		if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters)
35 		{
36 			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
37 			{
38 				v[i].x = Float4(0.0f);
39 				v[i].y = Float4(0.0f);
40 				v[i].z = Float4(0.0f);
41 				v[i].w = Float4(0.0f);
42 			}
43 		}
44 	}
45 
	// Destructor. The body is empty: no explicit cleanup is performed here.
	PixelRoutine::~PixelRoutine()
	{
	}
49 
	// Emits the pixel pipeline for one quad: stencil test, depth interpolation, early or
	// late depth test, interpolation of the shader inputs, shader execution, alpha test,
	// depth write, color raster operation, and finally the stencil write.
	//
	// cBuffer - color buffer pointers, forwarded to rasterOperation()
	// zBuffer - depth buffer pointer, used by depthTest() and writeDepth()
	// sBuffer - stencil buffer pointer, used by stencilTest() and writeStencil()
	// cMask   - per-sample coverage masks; the first state.multiSample entries are used
	//           and may be modified by the stages called from here
	// x, y    - quad coordinates (NOTE(review): presumably in pixels - confirm against caller)
	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
	{
		#if PERF_PROFILE
			Long pipeTime = Ticks();
		#endif

		// The depth test can only run before the shader when neither a depth override
		// nor the alpha test can change the outcome afterwards.
		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();

		Int zMask[4];   // Depth mask
		Int sMask[4];   // Stencil mask

		// Both masks start out equal to the coverage mask
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			zMask[q] = cMask[q];
			sMask[q] = cMask[q];
		}

		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
		}

		Float4 f;             // Fog value, only assigned when state.fog.component is set
		Float4 rhwCentroid;

		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);

		if(interpolateZ())
		{
			for(unsigned int q = 0; q < state.multiSample; q++)
			{
				// Note: this local intentionally shadows the 'x' parameter inside the loop
				Float4 x = xxxx;

				if(state.multiSample > 1)
				{
					// Per-sample x offset taken from the Constants::X table
					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
				}

				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
			}
		}

		Bool depthPass = false;

		if(earlyDepthTest)
		{
			// The quad passes if at least one sample passes
			for(unsigned int q = 0; q < state.multiSample; q++)
			{
				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
			}
		}

		// Without an early depth test the remainder always executes
		If(depthPass || Bool(!earlyDepthTest))
		{
			#if PERF_PROFILE
				Long interpTime = Ticks();
			#endif

			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);

			// Centroid locations
			Float4 XXXX = Float4(0.0f);
			Float4 YYYY = Float4(0.0f);

			if(state.centroid)
			{
				// Starts at a tiny non-zero value, so the reciprocal below is never taken of exactly zero
				Float4 WWWW(1.0e-9f);

				// Accumulate table entries selected by each sample's coverage mask
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
				}

				// Normalize the accumulated offsets by the accumulated weight
				WWWW = Rcp_pp(WWWW);
				XXXX *= WWWW;
				YYYY *= WWWW;

				XXXX += xxxx;
				YYYY += yyyy;
			}

			if(interpolateW())
			{
				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
				rhw = reciprocal(w, false, false, true);

				if(state.centroid)
				{
					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
				}
			}

			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
			{
				for(int component = 0; component < 4; component++)
				{
					// Only components flagged in the state's component mask are interpolated
					if(state.interpolant[interpolant].component & (1 << component))
					{
						if(!state.interpolant[interpolant].centroid)
						{
							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
						}
						else
						{
							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
						}
					}
				}

				Float4 rcp;

				// Projection: divide the leading components by the y, z or w component,
				// depending on the 'project' value
				switch(state.interpolant[interpolant].project)
				{
				case 0:
					break;
				case 1:
					rcp = reciprocal(v[interpolant].y);
					v[interpolant].x = v[interpolant].x * rcp;
					break;
				case 2:
					rcp = reciprocal(v[interpolant].z);
					v[interpolant].x = v[interpolant].x * rcp;
					v[interpolant].y = v[interpolant].y * rcp;
					break;
				case 3:
					rcp = reciprocal(v[interpolant].w);
					v[interpolant].x = v[interpolant].x * rcp;
					v[interpolant].y = v[interpolant].y * rcp;
					v[interpolant].z = v[interpolant].z * rcp;
					break;
				}
			}

			if(state.fog.component)
			{
				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
			}

			setBuiltins(x, y, z, w);

			#if PERF_PROFILE
				cycles[PERF_INTERP] += Ticks() - interpTime;
			#endif

			Bool alphaPass = true;

			if(colorUsed())
			{
				#if PERF_PROFILE
					Long shaderTime = Ticks();
				#endif

				applyShader(cMask);

				#if PERF_PROFILE
					cycles[PERF_SHADER] += Ticks() - shaderTime;
				#endif

				alphaPass = alphaTest(cMask);

				// Samples removed from the coverage mask by a shader kill or by the alpha test
				// must not update depth or stencil either
				if((shader && shader->containsKill()) || state.alphaTestActive())
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						zMask[q] &= cMask[q];
						sMask[q] &= cMask[q];
					}
				}
			}

			If(alphaPass)
			{
				// Late depth test, for the cases where the early one could not be used
				if(!earlyDepthTest)
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
					}
				}

				#if PERF_PROFILE
					Long ropTime = Ticks();
				#endif

				// With an early depth test, depthPass was already required to get here
				If(depthPass || Bool(earlyDepthTest))
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						// Samples excluded by the multisample mask are not written
						if(state.multiSampleMask & (1 << q))
						{
							writeDepth(zBuffer, q, x, z[q], zMask[q]);

							if(state.occlusionEnabled)
							{
								// Adds the occlusionCount table entry selected by the combined depth and stencil mask
								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
							}
						}
					}

					if(colorUsed())
					{
						#if PERF_PROFILE
							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
						#endif

						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
					}
				}

				#if PERF_PROFILE
					cycles[PERF_ROP] += Ticks() - ropTime;
				#endif
			}
		}

		// The stencil write is emitted outside the depth/alpha conditionals above, so it
		// also runs when those tests fail
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			if(state.multiSampleMask & (1 << q))
			{
				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
			}
		}

		#if PERF_PROFILE
			cycles[PERF_PIPE] += Ticks() - pipeTime;
		#endif
	}
279 
interpolateCentroid(Float4 & x,Float4 & y,Float4 & rhw,Pointer<Byte> planeEquation,bool flat,bool perspective)280 	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
281 	{
282 		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
283 
284 		if(!flat)
285 		{
286 			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
287 			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
288 
289 			if(perspective)
290 			{
291 				interpolant *= rhw;
292 			}
293 		}
294 
295 		return interpolant;
296 	}
297 
stencilTest(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & cMask)298 	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
299 	{
300 		if(!state.stencilActive)
301 		{
302 			return;
303 		}
304 
305 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
306 
307 		Pointer<Byte> buffer = sBuffer + 2 * x;
308 
309 		if(q > 0)
310 		{
311 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
312 		}
313 
314 		Byte8 value = *Pointer<Byte8>(buffer);
315 		Byte8 valueCCW = value;
316 
317 		if(!state.noStencilMask)
318 		{
319 			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
320 		}
321 
322 		stencilTest(value, state.stencilCompareMode, false);
323 
324 		if(state.twoSidedStencil)
325 		{
326 			if(!state.noStencilMaskCCW)
327 			{
328 				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
329 			}
330 
331 			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
332 
333 			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
334 			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
335 			value |= valueCCW;
336 		}
337 
338 		sMask = SignMask(value) & cMask;
339 	}
340 
stencilTest(Byte8 & value,StencilCompareMode stencilCompareMode,bool CCW)341 	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
342 	{
343 		Byte8 equal;
344 
345 		switch(stencilCompareMode)
346 		{
347 		case STENCIL_ALWAYS:
348 			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
349 			break;
350 		case STENCIL_NEVER:
351 			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
352 			break;
353 		case STENCIL_LESS:			// a < b ~ b > a
354 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
355 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
356 			break;
357 		case STENCIL_EQUAL:
358 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
359 			break;
360 		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
361 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
362 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
363 			break;
364 		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
365 			equal = value;
366 			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
367 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
368 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
369 			value |= equal;
370 			break;
371 		case STENCIL_GREATER:		// a > b
372 			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
373 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
374 			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
375 			value = equal;
376 			break;
377 		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
378 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
379 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
380 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
381 			break;
382 		default:
383 			ASSERT(false);
384 		}
385 	}
386 
	// Emits the depth comparison for sample q of the current quad.
	//
	// zBuffer - depth buffer pointer
	// q       - sample index; samples beyond the first are offset by q * depthSliceB
	// x       - quad x coordinate, used to address the depth buffer
	// z       - interpolated depth; replaced by oDepth when the shader overrides depth
	// sMask   - stencil mask, ANDed into the result when the stencil is active
	// zMask   - receives the mask of pixels that pass the test
	// cMask   - coverage mask; limits the pixels that can pass
	//
	// Returns whether zMask is non-zero. When the depth test is not active it returns
	// true immediately and zMask is left unchanged.
	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
	{
		if(!state.depthTestActive)
		{
			return true;
		}

		Float4 Z = z;

		if(shader && shader->depthOverride())
		{
			// The shader-written depth is complemented to match a complementary depth buffer
			if(complementaryDepthBuffer)
			{
				Z = Float4(1.0f) - oDepth;
			}
			else
			{
				Z = oDepth;
			}
		}

		Pointer<Byte> buffer;
		Int pitch;   // Only assigned (and only used) for the non-quad layout

		if(!state.quadLayoutDepthBuffer)
		{
			buffer = zBuffer + 4 * x;
			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
		}
		else
		{
			buffer = zBuffer + 8 * x;
		}

		if(q > 0)
		{
			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
		}

		Float4 zValue;

		// This condition is false only for DEPTH_NEVER with depth writes enabled;
		// in every other configuration the current depth values are loaded.
		// NOTE(review): the load also happens for DEPTH_ALWAYS, which does not use zValue
		// in this function - confirm whether that is intended.
		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
		{
			if(!state.quadLayoutDepthBuffer)
			{
				// The .xy lanes come from 'buffer'; the .zw lanes come from a load at
				// 'buffer + pitch - 8', i.e. from the same x position one pitch further.
				// FIXME: Properly optimizes?
				zValue.xy = *Pointer<Float4>(buffer);
				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
			}
			else
			{
				zValue = *Pointer<Float4>(buffer, 16);
			}
		}

		Int4 zTest;

		// The comparisons are written with the buffer value on the left-hand side.
		// With a complementary depth buffer the direction of each ordered comparison
		// is reversed.
		switch(state.depthCompareMode)
		{
		case DEPTH_ALWAYS:
			// Optimized
			break;
		case DEPTH_NEVER:
			// Optimized
			break;
		case DEPTH_EQUAL:
			zTest = CmpEQ(zValue, Z);
			break;
		case DEPTH_NOTEQUAL:
			zTest = CmpNEQ(zValue, Z);
			break;
		case DEPTH_LESS:
			if(complementaryDepthBuffer)
			{
				zTest = CmpLT(zValue, Z);
			}
			else
			{
				zTest = CmpNLE(zValue, Z);
			}
			break;
		case DEPTH_GREATEREQUAL:
			if(complementaryDepthBuffer)
			{
				zTest = CmpNLT(zValue, Z);
			}
			else
			{
				zTest = CmpLE(zValue, Z);
			}
			break;
		case DEPTH_LESSEQUAL:
			if(complementaryDepthBuffer)
			{
				zTest = CmpLE(zValue, Z);
			}
			else
			{
				zTest = CmpNLT(zValue, Z);
			}
			break;
		case DEPTH_GREATER:
			if(complementaryDepthBuffer)
			{
				zTest = CmpNLE(zValue, Z);
			}
			else
			{
				zTest = CmpLT(zValue, Z);
			}
			break;
		default:
			ASSERT(false);
		}

		// Convert the comparison result into a mask; ALWAYS and NEVER do not need zTest
		switch(state.depthCompareMode)
		{
		case DEPTH_ALWAYS:
			zMask = cMask;
			break;
		case DEPTH_NEVER:
			zMask = 0x0;
			break;
		default:
			zMask = SignMask(zTest) & cMask;
			break;
		}

		if(state.stencilActive)
		{
			// Pixels that failed the stencil test cannot pass the depth test
			zMask &= sMask;
		}

		return zMask != 0;
	}
522 
alphaTest(Int & aMask,Short4 & alpha)523 	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
524 	{
525 		Short4 cmp;
526 		Short4 equal;
527 
528 		switch(state.alphaCompareMode)
529 		{
530 		case ALPHA_ALWAYS:
531 			aMask = 0xF;
532 			break;
533 		case ALPHA_NEVER:
534 			aMask = 0x0;
535 			break;
536 		case ALPHA_EQUAL:
537 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
538 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
539 			break;
540 		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
541 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
542 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
543 			break;
544 		case ALPHA_LESS:           // a < b ~ b > a
545 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
546 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
547 			break;
548 		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
549 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
550 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
551 			cmp |= equal;
552 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
553 			break;
554 		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
555 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
556 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
557 			break;
558 		case ALPHA_GREATER:        // a > b
559 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
560 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
561 			break;
562 		default:
563 			ASSERT(false);
564 		}
565 	}
566 
alphaToCoverage(Int cMask[4],Float4 & alpha)567 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
568 	{
569 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
570 		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
571 		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
572 		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
573 
574 		Int aMask0 = SignMask(coverage0);
575 		Int aMask1 = SignMask(coverage1);
576 		Int aMask2 = SignMask(coverage2);
577 		Int aMask3 = SignMask(coverage3);
578 
579 		cMask[0] &= aMask0;
580 		cMask[1] &= aMask1;
581 		cMask[2] &= aMask2;
582 		cMask[3] &= aMask3;
583 	}
584 
fogBlend(Vector4f & c0,Float4 & fog)585 	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
586 	{
587 		if(!state.fogActive)
588 		{
589 			return;
590 		}
591 
592 		if(state.pixelFogMode != FOG_NONE)
593 		{
594 			pixelFog(fog);
595 
596 			fog = Min(fog, Float4(1.0f));
597 			fog = Max(fog, Float4(0.0f));
598 		}
599 
600 		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
601 		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
602 		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
603 
604 		c0.x *= fog;
605 		c0.y *= fog;
606 		c0.z *= fog;
607 
608 		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
609 		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
610 		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
611 	}
612 
pixelFog(Float4 & visibility)613 	void PixelRoutine::pixelFog(Float4 &visibility)
614 	{
615 		Float4 &zw = visibility;
616 
617 		if(state.pixelFogMode != FOG_NONE)
618 		{
619 			if(state.wBasedFog)
620 			{
621 				zw = rhw;
622 			}
623 			else
624 			{
625 				if(complementaryDepthBuffer)
626 				{
627 					zw = Float4(1.0f) - z[0];
628 				}
629 				else
630 				{
631 					zw = z[0];
632 				}
633 			}
634 		}
635 
636 		switch(state.pixelFogMode)
637 		{
638 		case FOG_NONE:
639 			break;
640 		case FOG_LINEAR:
641 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
642 			zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
643 			break;
644 		case FOG_EXP:
645 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
646 			zw = exponential2(zw, true);
647 			break;
648 		case FOG_EXP2:
649 			zw *= zw;
650 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
651 			zw = exponential2(zw, true);
652 			break;
653 		default:
654 			ASSERT(false);
655 		}
656 	}
657 
	// Emits the depth buffer write for sample q of the current quad.
	//
	// zBuffer - depth buffer pointer
	// q       - sample index; samples beyond the first are offset by q * depthSliceB
	// x       - quad x coordinate, used to address the depth buffer
	// z       - interpolated depth; replaced by oDepth when the shader overrides depth
	// zMask   - selects which pixels receive the new depth; the others keep the value
	//           loaded from the buffer
	//
	// Does nothing when depth writes are disabled.
	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
	{
		if(!state.depthWriteEnable)
		{
			return;
		}

		Float4 Z = z;

		if(shader && shader->depthOverride())
		{
			// The shader-written depth is complemented to match a complementary depth buffer
			if(complementaryDepthBuffer)
			{
				Z = Float4(1.0f) - oDepth;
			}
			else
			{
				Z = oDepth;
			}
		}

		Pointer<Byte> buffer;
		Int pitch;   // Only assigned (and only used) for the non-quad layout

		if(!state.quadLayoutDepthBuffer)
		{
			buffer = zBuffer + 4 * x;
			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
		}
		else
		{
			buffer = zBuffer + 8 * x;
		}

		if(q > 0)
		{
			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
		}

		Float4 zValue;

		// depthWriteEnable is known to be true here, so this reduces to
		// 'depthCompareMode != DEPTH_NEVER': the current values are loaded unless the
		// compare mode is DEPTH_NEVER.
		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
		{
			if(!state.quadLayoutDepthBuffer)
			{
				// The .xy lanes come from 'buffer'; the .zw lanes come from a load at
				// 'buffer + pitch - 8', i.e. from the same x position one pitch further.
				// FIXME: Properly optimizes?
				zValue.xy = *Pointer<Float4>(buffer);
				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
			}
			else
			{
				zValue = *Pointer<Float4>(buffer, 16);
			}
		}

		// Merge: keep Z where the table entry selected by zMask is set, and the loaded
		// value elsewhere (maskD4X / invMaskD4X are indexed by zMask, 16 bytes per entry).
		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));

		if(!state.quadLayoutDepthBuffer)
		{
			// Two separate two-float stores: .xy at 'buffer' and .zw one pitch further
			// FIXME: Properly optimizes?
			*Pointer<Float2>(buffer) = Float2(Z.xy);
			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
		}
		else
		{
			*Pointer<Float4>(buffer, 16) = Z;
		}
	}
728 
writeStencil(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & zMask,Int & cMask)729 	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
730 	{
731 		if(!state.stencilActive)
732 		{
733 			return;
734 		}
735 
736 		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
737 		{
738 			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
739 			{
740 				return;
741 			}
742 		}
743 
744 		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
745 		{
746 			return;
747 		}
748 
749 		Pointer<Byte> buffer = sBuffer + 2 * x;
750 
751 		if(q > 0)
752 		{
753 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
754 		}
755 
756 		Byte8 bufferValue = *Pointer<Byte8>(buffer);
757 
758 		Byte8 newValue;
759 		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
760 
761 		if(!state.noStencilWriteMask)
762 		{
763 			Byte8 maskedValue = bufferValue;
764 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
765 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
766 			newValue |= maskedValue;
767 		}
768 
769 		if(state.twoSidedStencil)
770 		{
771 			Byte8 newValueCCW;
772 
773 			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
774 
775 			if(!state.noStencilWriteMaskCCW)
776 			{
777 				Byte8 maskedValue = bufferValue;
778 				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
779 				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
780 				newValueCCW |= maskedValue;
781 			}
782 
783 			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
784 			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
785 			newValue |= newValueCCW;
786 		}
787 
788 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
789 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
790 		newValue |= bufferValue;
791 
792 		*Pointer<Byte4>(buffer) = Byte4(newValue);
793 	}
794 
stencilOperation(Byte8 & newValue,Byte8 & bufferValue,StencilOperation stencilPassOperation,StencilOperation stencilZFailOperation,StencilOperation stencilFailOperation,bool CCW,Int & zMask,Int & sMask)795 	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
796 	{
797 		Byte8 &pass = newValue;
798 		Byte8 fail;
799 		Byte8 zFail;
800 
801 		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
802 
803 		if(stencilZFailOperation != stencilPassOperation)
804 		{
805 			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
806 		}
807 
808 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
809 		{
810 			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
811 		}
812 
813 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
814 		{
815 			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
816 			{
817 				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
818 				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
819 				pass |= zFail;
820 			}
821 
822 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
823 			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
824 			pass |= fail;
825 		}
826 	}
827 
stencilOperation(Byte8 & output,Byte8 & bufferValue,StencilOperation operation,bool CCW)828 	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
829 	{
830 		switch(operation)
831 		{
832 		case OPERATION_KEEP:
833 			output = bufferValue;
834 			break;
835 		case OPERATION_ZERO:
836 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
837 			break;
838 		case OPERATION_REPLACE:
839 			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
840 			break;
841 		case OPERATION_INCRSAT:
842 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
843 			break;
844 		case OPERATION_DECRSAT:
845 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
846 			break;
847 		case OPERATION_INVERT:
848 			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
849 			break;
850 		case OPERATION_INCR:
851 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
852 			break;
853 		case OPERATION_DECR:
854 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
855 			break;
856 		default:
857 			ASSERT(false);
858 		}
859 	}
860 
blendFactor(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorActive)861 	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
862 	{
863 		switch(blendFactorActive)
864 		{
865 		case BLEND_ZERO:
866 			// Optimized
867 			break;
868 		case BLEND_ONE:
869 			// Optimized
870 			break;
871 		case BLEND_SOURCE:
872 			blendFactor.x = current.x;
873 			blendFactor.y = current.y;
874 			blendFactor.z = current.z;
875 			break;
876 		case BLEND_INVSOURCE:
877 			blendFactor.x = Short4(0xFFFFu) - current.x;
878 			blendFactor.y = Short4(0xFFFFu) - current.y;
879 			blendFactor.z = Short4(0xFFFFu) - current.z;
880 			break;
881 		case BLEND_DEST:
882 			blendFactor.x = pixel.x;
883 			blendFactor.y = pixel.y;
884 			blendFactor.z = pixel.z;
885 			break;
886 		case BLEND_INVDEST:
887 			blendFactor.x = Short4(0xFFFFu) - pixel.x;
888 			blendFactor.y = Short4(0xFFFFu) - pixel.y;
889 			blendFactor.z = Short4(0xFFFFu) - pixel.z;
890 			break;
891 		case BLEND_SOURCEALPHA:
892 			blendFactor.x = current.w;
893 			blendFactor.y = current.w;
894 			blendFactor.z = current.w;
895 			break;
896 		case BLEND_INVSOURCEALPHA:
897 			blendFactor.x = Short4(0xFFFFu) - current.w;
898 			blendFactor.y = Short4(0xFFFFu) - current.w;
899 			blendFactor.z = Short4(0xFFFFu) - current.w;
900 			break;
901 		case BLEND_DESTALPHA:
902 			blendFactor.x = pixel.w;
903 			blendFactor.y = pixel.w;
904 			blendFactor.z = pixel.w;
905 			break;
906 		case BLEND_INVDESTALPHA:
907 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
908 			blendFactor.y = Short4(0xFFFFu) - pixel.w;
909 			blendFactor.z = Short4(0xFFFFu) - pixel.w;
910 			break;
911 		case BLEND_SRCALPHASAT:
912 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
913 			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
914 			blendFactor.y = blendFactor.x;
915 			blendFactor.z = blendFactor.x;
916 			break;
917 		case BLEND_CONSTANT:
918 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
919 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
920 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
921 			break;
922 		case BLEND_INVCONSTANT:
923 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
924 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
925 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
926 			break;
927 		case BLEND_CONSTANTALPHA:
928 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
929 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
930 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
931 			break;
932 		case BLEND_INVCONSTANTALPHA:
933 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
934 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
935 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
936 			break;
937 		default:
938 			ASSERT(false);
939 		}
940 	}
941 
blendFactorAlpha(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorAlphaActive)942 	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
943 	{
944 		switch(blendFactorAlphaActive)
945 		{
946 		case BLEND_ZERO:
947 			// Optimized
948 			break;
949 		case BLEND_ONE:
950 			// Optimized
951 			break;
952 		case BLEND_SOURCE:
953 			blendFactor.w = current.w;
954 			break;
955 		case BLEND_INVSOURCE:
956 			blendFactor.w = Short4(0xFFFFu) - current.w;
957 			break;
958 		case BLEND_DEST:
959 			blendFactor.w = pixel.w;
960 			break;
961 		case BLEND_INVDEST:
962 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
963 			break;
964 		case BLEND_SOURCEALPHA:
965 			blendFactor.w = current.w;
966 			break;
967 		case BLEND_INVSOURCEALPHA:
968 			blendFactor.w = Short4(0xFFFFu) - current.w;
969 			break;
970 		case BLEND_DESTALPHA:
971 			blendFactor.w = pixel.w;
972 			break;
973 		case BLEND_INVDESTALPHA:
974 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
975 			break;
976 		case BLEND_SRCALPHASAT:
977 			blendFactor.w = Short4(0xFFFFu);
978 			break;
979 		case BLEND_CONSTANT:
980 		case BLEND_CONSTANTALPHA:
981 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
982 			break;
983 		case BLEND_INVCONSTANT:
984 		case BLEND_INVCONSTANTALPHA:
985 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
986 			break;
987 		default:
988 			ASSERT(false);
989 		}
990 	}
991 
isSRGB(int index) const992 	bool PixelRoutine::isSRGB(int index) const
993 	{
994 		return Surface::isSRGBformat(state.targetFormat[index]);
995 	}
996 
readPixel(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & pixel)997 	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
998 	{
999 		Short4 c01;
1000 		Short4 c23;
1001 		Pointer<Byte> buffer;
1002 		Pointer<Byte> buffer2;
1003 
1004 		switch(state.targetFormat[index])
1005 		{
1006 		case FORMAT_R5G6B5:
1007 			buffer = cBuffer + 2 * x;
1008 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1009 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1010 
1011 			pixel.x = c01 & Short4(0xF800u);
1012 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
1013 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
1014 			pixel.w = Short4(0xFFFFu);
1015 			break;
1016 		case FORMAT_A8R8G8B8:
1017 			buffer = cBuffer + 4 * x;
1018 			c01 = *Pointer<Short4>(buffer);
1019 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1020 			c23 = *Pointer<Short4>(buffer);
1021 			pixel.z = c01;
1022 			pixel.y = c01;
1023 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1024 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1025 			pixel.x = pixel.z;
1026 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1027 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1028 			pixel.y = pixel.z;
1029 			pixel.w = pixel.x;
1030 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1031 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1032 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1033 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1034 			break;
1035 		case FORMAT_A8B8G8R8:
1036 		case FORMAT_SRGB8_A8:
1037 			buffer = cBuffer + 4 * x;
1038 			c01 = *Pointer<Short4>(buffer);
1039 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1040 			c23 = *Pointer<Short4>(buffer);
1041 			pixel.z = c01;
1042 			pixel.y = c01;
1043 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1044 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1045 			pixel.x = pixel.z;
1046 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1047 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1048 			pixel.y = pixel.z;
1049 			pixel.w = pixel.x;
1050 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1051 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1052 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1053 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1054 			break;
1055 		case FORMAT_A8:
1056 			buffer = cBuffer + 1 * x;
1057 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1058 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1059 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1060 			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1061 			pixel.x = Short4(0x0000);
1062 			pixel.y = Short4(0x0000);
1063 			pixel.z = Short4(0x0000);
1064 			break;
1065 		case FORMAT_R8:
1066 			buffer = cBuffer + 1 * x;
1067 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1068 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1069 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1070 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1071 			pixel.y = Short4(0x0000);
1072 			pixel.z = Short4(0x0000);
1073 			pixel.w = Short4(0xFFFFu);
1074 			break;
1075 		case FORMAT_X8R8G8B8:
1076 			buffer = cBuffer + 4 * x;
1077 			c01 = *Pointer<Short4>(buffer);
1078 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1079 			c23 = *Pointer<Short4>(buffer);
1080 			pixel.z = c01;
1081 			pixel.y = c01;
1082 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1083 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1084 			pixel.x = pixel.z;
1085 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1086 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1087 			pixel.y = pixel.z;
1088 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1089 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1090 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1091 			pixel.w = Short4(0xFFFFu);
1092 			break;
1093 		case FORMAT_G8R8:
1094 			buffer = cBuffer + 2 * x;
1095 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1096 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1097 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1098 			pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1099 			pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1100 			pixel.z = Short4(0x0000u);
1101 			pixel.w = Short4(0xFFFFu);
1102 			break;
1103 		case FORMAT_X8B8G8R8:
1104 		case FORMAT_SRGB8_X8:
1105 			buffer = cBuffer + 4 * x;
1106 			c01 = *Pointer<Short4>(buffer);
1107 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1108 			c23 = *Pointer<Short4>(buffer);
1109 			pixel.z = c01;
1110 			pixel.y = c01;
1111 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1112 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1113 			pixel.x = pixel.z;
1114 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1115 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1116 			pixel.y = pixel.z;
1117 			pixel.w = pixel.x;
1118 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1119 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1120 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1121 			pixel.w = Short4(0xFFFFu);
1122 			break;
1123 		case FORMAT_A8G8R8B8Q:
1124 			UNIMPLEMENTED();
1125 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1126 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1127 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1128 		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1129 			break;
1130 		case FORMAT_X8G8R8B8Q:
1131 			UNIMPLEMENTED();
1132 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1133 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1134 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1135 		//	pixel.w = Short4(0xFFFFu);
1136 			break;
1137 		case FORMAT_A16B16G16R16:
1138 			buffer = cBuffer;
1139 			pixel.x = *Pointer<Short4>(buffer + 8 * x);
1140 			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1141 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1142 			pixel.z = *Pointer<Short4>(buffer + 8 * x);
1143 			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1144 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1145 			break;
1146 		case FORMAT_G16R16:
1147 			buffer = cBuffer;
1148 			pixel.x = *Pointer<Short4>(buffer + 4 * x);
1149 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1150 			pixel.y = *Pointer<Short4>(buffer + 4 * x);
1151 			pixel.z = pixel.x;
1152 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1153 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1154 			pixel.y = pixel.z;
1155 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1156 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1157 			pixel.z = Short4(0xFFFFu);
1158 			pixel.w = Short4(0xFFFFu);
1159 			break;
1160 		default:
1161 			ASSERT(false);
1162 		}
1163 
1164 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1165 		{
1166 			sRGBtoLinear16_12_16(pixel);
1167 		}
1168 	}
1169 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1170 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1171 	{
1172 		if(!state.alphaBlendActive)
1173 		{
1174 			return;
1175 		}
1176 
1177 		Vector4s pixel;
1178 		readPixel(index, cBuffer, x, pixel);
1179 
1180 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1181 		Vector4s sourceFactor;
1182 		Vector4s destFactor;
1183 
1184 		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1185 		blendFactor(destFactor, current, pixel, state.destBlendFactor);
1186 
1187 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1188 		{
1189 			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1190 			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1191 			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1192 		}
1193 
1194 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1195 		{
1196 			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1197 			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1198 			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1199 		}
1200 
1201 		switch(state.blendOperation)
1202 		{
1203 		case BLENDOP_ADD:
1204 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1205 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1206 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1207 			break;
1208 		case BLENDOP_SUB:
1209 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1210 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1211 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1212 			break;
1213 		case BLENDOP_INVSUB:
1214 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1215 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1216 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1217 			break;
1218 		case BLENDOP_MIN:
1219 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1220 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1221 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1222 			break;
1223 		case BLENDOP_MAX:
1224 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1225 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1226 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1227 			break;
1228 		case BLENDOP_SOURCE:
1229 			// No operation
1230 			break;
1231 		case BLENDOP_DEST:
1232 			current.x = pixel.x;
1233 			current.y = pixel.y;
1234 			current.z = pixel.z;
1235 			break;
1236 		case BLENDOP_NULL:
1237 			current.x = Short4(0x0000);
1238 			current.y = Short4(0x0000);
1239 			current.z = Short4(0x0000);
1240 			break;
1241 		default:
1242 			ASSERT(false);
1243 		}
1244 
1245 		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1246 		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1247 
1248 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1249 		{
1250 			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1251 		}
1252 
1253 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1254 		{
1255 			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1256 		}
1257 
1258 		switch(state.blendOperationAlpha)
1259 		{
1260 		case BLENDOP_ADD:
1261 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1262 			break;
1263 		case BLENDOP_SUB:
1264 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1265 			break;
1266 		case BLENDOP_INVSUB:
1267 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1268 			break;
1269 		case BLENDOP_MIN:
1270 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1271 			break;
1272 		case BLENDOP_MAX:
1273 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1274 			break;
1275 		case BLENDOP_SOURCE:
1276 			// No operation
1277 			break;
1278 		case BLENDOP_DEST:
1279 			current.w = pixel.w;
1280 			break;
1281 		case BLENDOP_NULL:
1282 			current.w = Short4(0x0000);
1283 			break;
1284 		default:
1285 			ASSERT(false);
1286 		}
1287 	}
1288 
logicOperation(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1289 	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1290 	{
1291 		if(state.logicalOperation == LOGICALOP_COPY)
1292 		{
1293 			return;
1294 		}
1295 
1296 		Vector4s pixel;
1297 		readPixel(index, cBuffer, x, pixel);
1298 
1299 		switch(state.logicalOperation)
1300 		{
1301 		case LOGICALOP_CLEAR:
1302 			current.x = UShort4(0);
1303 			current.y = UShort4(0);
1304 			current.z = UShort4(0);
1305 			break;
1306 		case LOGICALOP_SET:
1307 			current.x = UShort4(0xFFFFu);
1308 			current.y = UShort4(0xFFFFu);
1309 			current.z = UShort4(0xFFFFu);
1310 			break;
1311 		case LOGICALOP_COPY:
1312 			ASSERT(false);   // Optimized out
1313 			break;
1314 		case LOGICALOP_COPY_INVERTED:
1315 			current.x = ~current.x;
1316 			current.y = ~current.y;
1317 			current.z = ~current.z;
1318 			break;
1319 		case LOGICALOP_NOOP:
1320 			current.x = pixel.x;
1321 			current.y = pixel.y;
1322 			current.z = pixel.z;
1323 			break;
1324 		case LOGICALOP_INVERT:
1325 			current.x = ~pixel.x;
1326 			current.y = ~pixel.y;
1327 			current.z = ~pixel.z;
1328 			break;
1329 		case LOGICALOP_AND:
1330 			current.x = pixel.x & current.x;
1331 			current.y = pixel.y & current.y;
1332 			current.z = pixel.z & current.z;
1333 			break;
1334 		case LOGICALOP_NAND:
1335 			current.x = ~(pixel.x & current.x);
1336 			current.y = ~(pixel.y & current.y);
1337 			current.z = ~(pixel.z & current.z);
1338 			break;
1339 		case LOGICALOP_OR:
1340 			current.x = pixel.x | current.x;
1341 			current.y = pixel.y | current.y;
1342 			current.z = pixel.z | current.z;
1343 			break;
1344 		case LOGICALOP_NOR:
1345 			current.x = ~(pixel.x | current.x);
1346 			current.y = ~(pixel.y | current.y);
1347 			current.z = ~(pixel.z | current.z);
1348 			break;
1349 		case LOGICALOP_XOR:
1350 			current.x = pixel.x ^ current.x;
1351 			current.y = pixel.y ^ current.y;
1352 			current.z = pixel.z ^ current.z;
1353 			break;
1354 		case LOGICALOP_EQUIV:
1355 			current.x = ~(pixel.x ^ current.x);
1356 			current.y = ~(pixel.y ^ current.y);
1357 			current.z = ~(pixel.z ^ current.z);
1358 			break;
1359 		case LOGICALOP_AND_REVERSE:
1360 			current.x = ~pixel.x & current.x;
1361 			current.y = ~pixel.y & current.y;
1362 			current.z = ~pixel.z & current.z;
1363 			break;
1364 		case LOGICALOP_AND_INVERTED:
1365 			current.x = pixel.x & ~current.x;
1366 			current.y = pixel.y & ~current.y;
1367 			current.z = pixel.z & ~current.z;
1368 			break;
1369 		case LOGICALOP_OR_REVERSE:
1370 			current.x = ~pixel.x | current.x;
1371 			current.y = ~pixel.y | current.y;
1372 			current.z = ~pixel.z | current.z;
1373 			break;
1374 		case LOGICALOP_OR_INVERTED:
1375 			current.x = pixel.x | ~current.x;
1376 			current.y = pixel.y | ~current.y;
1377 			current.z = pixel.z | ~current.z;
1378 			break;
1379 		default:
1380 			ASSERT(false);
1381 		}
1382 	}
1383 
writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & current,Int & sMask,Int & zMask,Int & cMask)1384 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1385 	{
1386 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1387 		{
1388 			linearToSRGB16_12_16(current);
1389 		}
1390 
1391 		if(exactColorRounding)
1392 		{
1393 			switch(state.targetFormat[index])
1394 			{
1395 			case FORMAT_R5G6B5:
1396 				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1397 				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1398 				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1399 				break;
1400 			case FORMAT_X8G8R8B8Q:
1401 			case FORMAT_A8G8R8B8Q:
1402 			case FORMAT_X8R8G8B8:
1403 			case FORMAT_X8B8G8R8:
1404 			case FORMAT_A8R8G8B8:
1405 			case FORMAT_A8B8G8R8:
1406 			case FORMAT_SRGB8_X8:
1407 			case FORMAT_SRGB8_A8:
1408 			case FORMAT_G8R8:
1409 			case FORMAT_R8:
1410 				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1411 				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1412 				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1413 				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1414 				break;
1415 			default:
1416 				break;
1417 			}
1418 		}
1419 
1420 		int rgbaWriteMask = state.colorWriteActive(index);
1421 		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1422 
1423 		switch(state.targetFormat[index])
1424 		{
1425 		case FORMAT_R5G6B5:
1426 			{
1427 				current.x = current.x & Short4(0xF800u);
1428 				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1429 				current.z = As<UShort4>(current.z) >> 11;
1430 
1431 				current.x = current.x | current.y | current.z;
1432 			}
1433 			break;
1434 		case FORMAT_X8G8R8B8Q:
1435 			UNIMPLEMENTED();
1436 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1437 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1438 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1439 
1440 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1441 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1442 			break;
1443 		case FORMAT_A8G8R8B8Q:
1444 			UNIMPLEMENTED();
1445 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1446 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1447 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1448 		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1449 
1450 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1451 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1452 			break;
1453 		case FORMAT_X8R8G8B8:
1454 		case FORMAT_A8R8G8B8:
1455 			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1456 			{
1457 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1458 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1459 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1460 
1461 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
1462 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
1463 
1464 				current.x = current.z;
1465 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1466 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1467 				current.y = current.z;
1468 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1469 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1470 			}
1471 			else
1472 			{
1473 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1474 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1475 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1476 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1477 
1478 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
1479 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
1480 
1481 				current.x = current.z;
1482 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1483 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1484 				current.y = current.z;
1485 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1486 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1487 			}
1488 			break;
1489 		case FORMAT_X8B8G8R8:
1490 		case FORMAT_A8B8G8R8:
1491 		case FORMAT_SRGB8_X8:
1492 		case FORMAT_SRGB8_A8:
1493 			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1494 			{
1495 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1496 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1497 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1498 
1499 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
1500 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
1501 
1502 				current.x = current.z;
1503 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1504 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1505 				current.y = current.z;
1506 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1507 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1508 			}
1509 			else
1510 			{
1511 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1512 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1513 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1514 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1515 
1516 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
1517 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
1518 
1519 				current.x = current.z;
1520 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1521 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1522 				current.y = current.z;
1523 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1524 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1525 			}
1526 			break;
1527 		case FORMAT_G8R8:
1528 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1529 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1530 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
1531 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
1532 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1533 			break;
1534 		case FORMAT_R8:
1535 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1536 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
1537 			break;
1538 		case FORMAT_A8:
1539 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1540 			current.w = As<Short4>(PackUnsigned(current.w, current.w));
1541 			break;
1542 		case FORMAT_G16R16:
1543 			current.z = current.x;
1544 			current.x = As<Short4>(UnpackLow(current.x, current.y));
1545 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
1546 			current.y = current.z;
1547 			break;
1548 		case FORMAT_A16B16G16R16:
1549 			transpose4x4(current.x, current.y, current.z, current.w);
1550 			break;
1551 		default:
1552 			ASSERT(false);
1553 		}
1554 
1555 		Short4 c01 = current.z;
1556 		Short4 c23 = current.y;
1557 
1558 		Int xMask;   // Combination of all masks
1559 
1560 		if(state.depthTestActive)
1561 		{
1562 			xMask = zMask;
1563 		}
1564 		else
1565 		{
1566 			xMask = cMask;
1567 		}
1568 
1569 		if(state.stencilActive)
1570 		{
1571 			xMask &= sMask;
1572 		}
1573 
1574 		switch(state.targetFormat[index])
1575 		{
1576 		case FORMAT_R5G6B5:
1577 			{
1578 				Pointer<Byte> buffer = cBuffer + 2 * x;
1579 				Int value = *Pointer<Int>(buffer);
1580 
1581 				Int c01 = Extract(As<Int2>(current.x), 0);
1582 
1583 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1584 				{
1585 					Int masked = value;
1586 					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1587 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1588 					c01 |= masked;
1589 				}
1590 
1591 				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1592 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1593 				c01 |= value;
1594 				*Pointer<Int>(buffer) = c01;
1595 
1596 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1597 				value = *Pointer<Int>(buffer);
1598 
1599 				Int c23 = Extract(As<Int2>(current.x), 1);
1600 
1601 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1602 				{
1603 					Int masked = value;
1604 					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1605 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1606 					c23 |= masked;
1607 				}
1608 
1609 				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1610 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1611 				c23 |= value;
1612 				*Pointer<Int>(buffer) = c23;
1613 			}
1614 			break;
1615 		case FORMAT_A8G8R8B8Q:
1616 		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1617 			UNIMPLEMENTED();
1618 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1619 
1620 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1621 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1622 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1623 		//	{
1624 		//		Short4 masked = value;
1625 		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1626 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1627 		//		c01 |= masked;
1628 		//	}
1629 
1630 		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1631 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1632 		//	c01 |= value;
1633 		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1634 
1635 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1636 
1637 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1638 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1639 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1640 		//	{
1641 		//		Short4 masked = value;
1642 		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1643 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1644 		//		c23 |= masked;
1645 		//	}
1646 
1647 		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1648 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1649 		//	c23 |= value;
1650 		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1651 			break;
1652 		case FORMAT_A8R8G8B8:
1653 		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1654 			{
1655 				Pointer<Byte> buffer = cBuffer + x * 4;
1656 				Short4 value = *Pointer<Short4>(buffer);
1657 
1658 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1659 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1660 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1661 				{
1662 					Short4 masked = value;
1663 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1664 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1665 					c01 |= masked;
1666 				}
1667 
1668 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1669 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1670 				c01 |= value;
1671 				*Pointer<Short4>(buffer) = c01;
1672 
1673 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1674 				value = *Pointer<Short4>(buffer);
1675 
1676 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1677 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1678 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1679 				{
1680 					Short4 masked = value;
1681 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1682 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1683 					c23 |= masked;
1684 				}
1685 
1686 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1687 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1688 				c23 |= value;
1689 				*Pointer<Short4>(buffer) = c23;
1690 			}
1691 			break;
1692 		case FORMAT_A8B8G8R8:
1693 		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1694 		case FORMAT_SRGB8_X8:
1695 		case FORMAT_SRGB8_A8:
1696 			{
1697 				Pointer<Byte> buffer = cBuffer + x * 4;
1698 				Short4 value = *Pointer<Short4>(buffer);
1699 
1700 				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1701 				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1702 				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1703 
1704 				if(masked)
1705 				{
1706 					Short4 masked = value;
1707 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1708 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1709 					c01 |= masked;
1710 				}
1711 
1712 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1713 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1714 				c01 |= value;
1715 				*Pointer<Short4>(buffer) = c01;
1716 
1717 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1718 				value = *Pointer<Short4>(buffer);
1719 
1720 				if(masked)
1721 				{
1722 					Short4 masked = value;
1723 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1724 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1725 					c23 |= masked;
1726 				}
1727 
1728 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1729 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1730 				c23 |= value;
1731 				*Pointer<Short4>(buffer) = c23;
1732 			}
1733 			break;
1734 		case FORMAT_G8R8:
1735 			if((rgbaWriteMask & 0x00000003) != 0x0)
1736 			{
1737 				Pointer<Byte> buffer = cBuffer + 2 * x;
1738 				Int2 value;
1739 				value = Insert(value, *Pointer<Int>(buffer), 0);
1740 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1741 				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1742 
1743 				Int2 packedCol = As<Int2>(current.x);
1744 
1745 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1746 				if((rgbaWriteMask & 0x3) != 0x3)
1747 				{
1748 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1749 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1750 					mergedMask &= rgbaMask;
1751 				}
1752 
1753 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1754 
1755 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1756 				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1757 			}
1758 			break;
1759 		case FORMAT_R8:
1760 			if(rgbaWriteMask & 0x00000001)
1761 			{
1762 				Pointer<Byte> buffer = cBuffer + 1 * x;
1763 				Short4 value;
1764 				value = Insert(value, *Pointer<Short>(buffer), 0);
1765 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1766 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1767 
1768 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1769 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1770 				current.x |= value;
1771 
1772 				*Pointer<Short>(buffer) = Extract(current.x, 0);
1773 				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1774 			}
1775 			break;
1776 		case FORMAT_A8:
1777 			if(rgbaWriteMask & 0x00000008)
1778 			{
1779 				Pointer<Byte> buffer = cBuffer + 1 * x;
1780 				Short4 value;
1781 				value = Insert(value, *Pointer<Short>(buffer), 0);
1782 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1783 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1784 
1785 				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1786 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1787 				current.w |= value;
1788 
1789 				*Pointer<Short>(buffer) = Extract(current.w, 0);
1790 				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1791 			}
1792 			break;
1793 		case FORMAT_G16R16:
1794 			{
1795 				Pointer<Byte> buffer = cBuffer + 4 * x;
1796 
1797 				Short4 value = *Pointer<Short4>(buffer);
1798 
1799 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1800 				{
1801 					Short4 masked = value;
1802 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1803 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1804 					current.x |= masked;
1805 				}
1806 
1807 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1808 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1809 				current.x |= value;
1810 				*Pointer<Short4>(buffer) = current.x;
1811 
1812 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1813 
1814 				value = *Pointer<Short4>(buffer);
1815 
1816 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1817 				{
1818 					Short4 masked = value;
1819 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1820 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1821 					current.y |= masked;
1822 				}
1823 
1824 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1825 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1826 				current.y |= value;
1827 				*Pointer<Short4>(buffer) = current.y;
1828 			}
1829 			break;
1830 		case FORMAT_A16B16G16R16:
1831 			{
1832 				Pointer<Byte> buffer = cBuffer + 8 * x;
1833 
1834 				{
1835 					Short4 value = *Pointer<Short4>(buffer);
1836 
1837 					if(rgbaWriteMask != 0x0000000F)
1838 					{
1839 						Short4 masked = value;
1840 						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1841 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1842 						current.x |= masked;
1843 					}
1844 
1845 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1846 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1847 					current.x |= value;
1848 					*Pointer<Short4>(buffer) = current.x;
1849 				}
1850 
1851 				{
1852 					Short4 value = *Pointer<Short4>(buffer + 8);
1853 
1854 					if(rgbaWriteMask != 0x0000000F)
1855 					{
1856 						Short4 masked = value;
1857 						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1858 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1859 						current.y |= masked;
1860 					}
1861 
1862 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1863 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1864 					current.y |= value;
1865 					*Pointer<Short4>(buffer + 8) = current.y;
1866 				}
1867 
1868 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1869 
1870 				{
1871 					Short4 value = *Pointer<Short4>(buffer);
1872 
1873 					if(rgbaWriteMask != 0x0000000F)
1874 					{
1875 						Short4 masked = value;
1876 						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1877 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1878 						current.z |= masked;
1879 					}
1880 
1881 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1882 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1883 					current.z |= value;
1884 					*Pointer<Short4>(buffer) = current.z;
1885 				}
1886 
1887 				{
1888 					Short4 value = *Pointer<Short4>(buffer + 8);
1889 
1890 					if(rgbaWriteMask != 0x0000000F)
1891 					{
1892 						Short4 masked = value;
1893 						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1894 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1895 						current.w |= masked;
1896 					}
1897 
1898 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1899 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1900 					current.w |= value;
1901 					*Pointer<Short4>(buffer + 8) = current.w;
1902 				}
1903 			}
1904 			break;
1905 		default:
1906 			ASSERT(false);
1907 		}
1908 	}
1909 
blendFactor(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorActive)1910 	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1911 	{
1912 		switch(blendFactorActive)
1913 		{
1914 		case BLEND_ZERO:
1915 			// Optimized
1916 			break;
1917 		case BLEND_ONE:
1918 			// Optimized
1919 			break;
1920 		case BLEND_SOURCE:
1921 			blendFactor.x = oC.x;
1922 			blendFactor.y = oC.y;
1923 			blendFactor.z = oC.z;
1924 			break;
1925 		case BLEND_INVSOURCE:
1926 			blendFactor.x = Float4(1.0f) - oC.x;
1927 			blendFactor.y = Float4(1.0f) - oC.y;
1928 			blendFactor.z = Float4(1.0f) - oC.z;
1929 			break;
1930 		case BLEND_DEST:
1931 			blendFactor.x = pixel.x;
1932 			blendFactor.y = pixel.y;
1933 			blendFactor.z = pixel.z;
1934 			break;
1935 		case BLEND_INVDEST:
1936 			blendFactor.x = Float4(1.0f) - pixel.x;
1937 			blendFactor.y = Float4(1.0f) - pixel.y;
1938 			blendFactor.z = Float4(1.0f) - pixel.z;
1939 			break;
1940 		case BLEND_SOURCEALPHA:
1941 			blendFactor.x = oC.w;
1942 			blendFactor.y = oC.w;
1943 			blendFactor.z = oC.w;
1944 			break;
1945 		case BLEND_INVSOURCEALPHA:
1946 			blendFactor.x = Float4(1.0f) - oC.w;
1947 			blendFactor.y = Float4(1.0f) - oC.w;
1948 			blendFactor.z = Float4(1.0f) - oC.w;
1949 			break;
1950 		case BLEND_DESTALPHA:
1951 			blendFactor.x = pixel.w;
1952 			blendFactor.y = pixel.w;
1953 			blendFactor.z = pixel.w;
1954 			break;
1955 		case BLEND_INVDESTALPHA:
1956 			blendFactor.x = Float4(1.0f) - pixel.w;
1957 			blendFactor.y = Float4(1.0f) - pixel.w;
1958 			blendFactor.z = Float4(1.0f) - pixel.w;
1959 			break;
1960 		case BLEND_SRCALPHASAT:
1961 			blendFactor.x = Float4(1.0f) - pixel.w;
1962 			blendFactor.x = Min(blendFactor.x, oC.w);
1963 			blendFactor.y = blendFactor.x;
1964 			blendFactor.z = blendFactor.x;
1965 			break;
1966 		case BLEND_CONSTANT:
1967 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1968 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1969 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1970 			break;
1971 		case BLEND_INVCONSTANT:
1972 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1973 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1974 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1975 			break;
1976 		default:
1977 			ASSERT(false);
1978 		}
1979 	}
1980 
blendFactorAlpha(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorAlphaActive)1981 	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1982 	{
1983 		switch(blendFactorAlphaActive)
1984 		{
1985 		case BLEND_ZERO:
1986 			// Optimized
1987 			break;
1988 		case BLEND_ONE:
1989 			// Optimized
1990 			break;
1991 		case BLEND_SOURCE:
1992 			blendFactor.w = oC.w;
1993 			break;
1994 		case BLEND_INVSOURCE:
1995 			blendFactor.w = Float4(1.0f) - oC.w;
1996 			break;
1997 		case BLEND_DEST:
1998 			blendFactor.w = pixel.w;
1999 			break;
2000 		case BLEND_INVDEST:
2001 			blendFactor.w = Float4(1.0f) - pixel.w;
2002 			break;
2003 		case BLEND_SOURCEALPHA:
2004 			blendFactor.w = oC.w;
2005 			break;
2006 		case BLEND_INVSOURCEALPHA:
2007 			blendFactor.w = Float4(1.0f) - oC.w;
2008 			break;
2009 		case BLEND_DESTALPHA:
2010 			blendFactor.w = pixel.w;
2011 			break;
2012 		case BLEND_INVDESTALPHA:
2013 			blendFactor.w = Float4(1.0f) - pixel.w;
2014 			break;
2015 		case BLEND_SRCALPHASAT:
2016 			blendFactor.w = Float4(1.0f);
2017 			break;
2018 		case BLEND_CONSTANT:
2019 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2020 			break;
2021 		case BLEND_INVCONSTANT:
2022 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
2023 			break;
2024 		default:
2025 			ASSERT(false);
2026 		}
2027 	}
2028 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4f & oC,Int & x)2029 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
2030 	{
2031 		if(!state.alphaBlendActive)
2032 		{
2033 			return;
2034 		}
2035 
2036 		Pointer<Byte> buffer;
2037 		Vector4f pixel;
2038 
2039 		Vector4s color;
2040 		Short4 c01;
2041 		Short4 c23;
2042 
2043 		Float4 one;
2044 		if(Surface::isFloatFormat(state.targetFormat[index]))
2045 		{
2046 			one = Float4(1.0f);
2047 		}
2048 		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2049 		{
2050 			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2051 		}
2052 
2053 		switch(state.targetFormat[index])
2054 		{
2055 		case FORMAT_R32I:
2056 		case FORMAT_R32UI:
2057 		case FORMAT_R32F:
2058 			buffer = cBuffer;
2059 			// FIXME: movlps
2060 			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2061 			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2062 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2063 			// FIXME: movhps
2064 			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2065 			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2066 			pixel.y = pixel.z = pixel.w = one;
2067 			break;
2068 		case FORMAT_G32R32I:
2069 		case FORMAT_G32R32UI:
2070 		case FORMAT_G32R32F:
2071 			buffer = cBuffer;
2072 			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2073 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2074 			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2075 			pixel.z = pixel.x;
2076 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2077 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2078 			pixel.y = pixel.z;
2079 			pixel.z = pixel.w = one;
2080 			break;
2081 		case FORMAT_X32B32G32R32F:
2082 		case FORMAT_A32B32G32R32F:
2083 		case FORMAT_X32B32G32R32F_UNSIGNED:
2084 		case FORMAT_A32B32G32R32I:
2085 		case FORMAT_A32B32G32R32UI:
2086 			buffer = cBuffer;
2087 			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2088 			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2089 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2090 			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2091 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2092 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2093 			if(state.targetFormat[index] == FORMAT_X32B32G32R32F ||
2094 			   state.targetFormat[index] == FORMAT_X32B32G32R32F_UNSIGNED)
2095 			{
2096 				pixel.w = Float4(1.0f);
2097 			}
2098 			break;
2099 		default:
2100 			ASSERT(false);
2101 		}
2102 
2103 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2104 		{
2105 			sRGBtoLinear(pixel.x);
2106 			sRGBtoLinear(pixel.y);
2107 			sRGBtoLinear(pixel.z);
2108 		}
2109 
2110 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2111 		Vector4f sourceFactor;
2112 		Vector4f destFactor;
2113 
2114 		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2115 		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2116 
2117 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2118 		{
2119 			oC.x *= sourceFactor.x;
2120 			oC.y *= sourceFactor.y;
2121 			oC.z *= sourceFactor.z;
2122 		}
2123 
2124 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2125 		{
2126 			pixel.x *= destFactor.x;
2127 			pixel.y *= destFactor.y;
2128 			pixel.z *= destFactor.z;
2129 		}
2130 
2131 		switch(state.blendOperation)
2132 		{
2133 		case BLENDOP_ADD:
2134 			oC.x += pixel.x;
2135 			oC.y += pixel.y;
2136 			oC.z += pixel.z;
2137 			break;
2138 		case BLENDOP_SUB:
2139 			oC.x -= pixel.x;
2140 			oC.y -= pixel.y;
2141 			oC.z -= pixel.z;
2142 			break;
2143 		case BLENDOP_INVSUB:
2144 			oC.x = pixel.x - oC.x;
2145 			oC.y = pixel.y - oC.y;
2146 			oC.z = pixel.z - oC.z;
2147 			break;
2148 		case BLENDOP_MIN:
2149 			oC.x = Min(oC.x, pixel.x);
2150 			oC.y = Min(oC.y, pixel.y);
2151 			oC.z = Min(oC.z, pixel.z);
2152 			break;
2153 		case BLENDOP_MAX:
2154 			oC.x = Max(oC.x, pixel.x);
2155 			oC.y = Max(oC.y, pixel.y);
2156 			oC.z = Max(oC.z, pixel.z);
2157 			break;
2158 		case BLENDOP_SOURCE:
2159 			// No operation
2160 			break;
2161 		case BLENDOP_DEST:
2162 			oC.x = pixel.x;
2163 			oC.y = pixel.y;
2164 			oC.z = pixel.z;
2165 			break;
2166 		case BLENDOP_NULL:
2167 			oC.x = Float4(0.0f);
2168 			oC.y = Float4(0.0f);
2169 			oC.z = Float4(0.0f);
2170 			break;
2171 		default:
2172 			ASSERT(false);
2173 		}
2174 
2175 		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2176 		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2177 
2178 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2179 		{
2180 			oC.w *= sourceFactor.w;
2181 		}
2182 
2183 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2184 		{
2185 			pixel.w *= destFactor.w;
2186 		}
2187 
2188 		switch(state.blendOperationAlpha)
2189 		{
2190 		case BLENDOP_ADD:
2191 			oC.w += pixel.w;
2192 			break;
2193 		case BLENDOP_SUB:
2194 			oC.w -= pixel.w;
2195 			break;
2196 		case BLENDOP_INVSUB:
2197 			pixel.w -= oC.w;
2198 			oC.w = pixel.w;
2199 			break;
2200 		case BLENDOP_MIN:
2201 			oC.w = Min(oC.w, pixel.w);
2202 			break;
2203 		case BLENDOP_MAX:
2204 			oC.w = Max(oC.w, pixel.w);
2205 			break;
2206 		case BLENDOP_SOURCE:
2207 			// No operation
2208 			break;
2209 		case BLENDOP_DEST:
2210 			oC.w = pixel.w;
2211 			break;
2212 		case BLENDOP_NULL:
2213 			oC.w = Float4(0.0f);
2214 			break;
2215 		default:
2216 			ASSERT(false);
2217 		}
2218 	}
2219 
writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4f & oC,Int & sMask,Int & zMask,Int & cMask)2220 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2221 	{
2222 		switch(state.targetFormat[index])
2223 		{
2224 		case FORMAT_R32F:
2225 		case FORMAT_R32I:
2226 		case FORMAT_R32UI:
2227 		case FORMAT_R16I:
2228 		case FORMAT_R16UI:
2229 		case FORMAT_R8I:
2230 		case FORMAT_R8UI:
2231 			break;
2232 		case FORMAT_G32R32F:
2233 		case FORMAT_G32R32I:
2234 		case FORMAT_G32R32UI:
2235 		case FORMAT_G16R16I:
2236 		case FORMAT_G16R16UI:
2237 		case FORMAT_G8R8I:
2238 		case FORMAT_G8R8UI:
2239 			oC.z = oC.x;
2240 			oC.x = UnpackLow(oC.x, oC.y);
2241 			oC.z = UnpackHigh(oC.z, oC.y);
2242 			oC.y = oC.z;
2243 			break;
2244 		case FORMAT_X32B32G32R32F:
2245 		case FORMAT_A32B32G32R32F:
2246 		case FORMAT_X32B32G32R32F_UNSIGNED:
2247 		case FORMAT_A32B32G32R32I:
2248 		case FORMAT_A32B32G32R32UI:
2249 		case FORMAT_A16B16G16R16I:
2250 		case FORMAT_A16B16G16R16UI:
2251 		case FORMAT_A8B8G8R8I:
2252 		case FORMAT_A8B8G8R8UI:
2253 			transpose4x4(oC.x, oC.y, oC.z, oC.w);
2254 			break;
2255 		default:
2256 			ASSERT(false);
2257 		}
2258 
2259 		int rgbaWriteMask = state.colorWriteActive(index);
2260 
2261 		Int xMask;   // Combination of all masks
2262 
2263 		if(state.depthTestActive)
2264 		{
2265 			xMask = zMask;
2266 		}
2267 		else
2268 		{
2269 			xMask = cMask;
2270 		}
2271 
2272 		if(state.stencilActive)
2273 		{
2274 			xMask &= sMask;
2275 		}
2276 
2277 		Pointer<Byte> buffer;
2278 		Float4 value;
2279 
2280 		switch(state.targetFormat[index])
2281 		{
2282 		case FORMAT_R32F:
2283 		case FORMAT_R32I:
2284 		case FORMAT_R32UI:
2285 			if(rgbaWriteMask & 0x00000001)
2286 			{
2287 				buffer = cBuffer + 4 * x;
2288 
2289 				// FIXME: movlps
2290 				value.x = *Pointer<Float>(buffer + 0);
2291 				value.y = *Pointer<Float>(buffer + 4);
2292 
2293 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2294 
2295 				// FIXME: movhps
2296 				value.z = *Pointer<Float>(buffer + 0);
2297 				value.w = *Pointer<Float>(buffer + 4);
2298 
2299 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2300 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2301 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2302 
2303 				// FIXME: movhps
2304 				*Pointer<Float>(buffer + 0) = oC.x.z;
2305 				*Pointer<Float>(buffer + 4) = oC.x.w;
2306 
2307 				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2308 
2309 				// FIXME: movlps
2310 				*Pointer<Float>(buffer + 0) = oC.x.x;
2311 				*Pointer<Float>(buffer + 4) = oC.x.y;
2312 			}
2313 			break;
2314 		case FORMAT_R16I:
2315 		case FORMAT_R16UI:
2316 			if(rgbaWriteMask & 0x00000001)
2317 			{
2318 				buffer = cBuffer + 2 * x;
2319 
2320 				UShort4 xyzw;
2321 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2322 
2323 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2324 
2325 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2326 				value = As<Float4>(Int4(xyzw));
2327 
2328 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2329 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2330 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2331 
2332 				if(state.targetFormat[index] == FORMAT_R16I)
2333 				{
2334 					Float component = oC.x.z;
2335 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2336 					component = oC.x.w;
2337 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2338 
2339 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2340 
2341 					component = oC.x.x;
2342 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2343 					component = oC.x.y;
2344 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2345 				}
2346 				else // FORMAT_R16UI
2347 				{
2348 					Float component = oC.x.z;
2349 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2350 					component = oC.x.w;
2351 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2352 
2353 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2354 
2355 					component = oC.x.x;
2356 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2357 					component = oC.x.y;
2358 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2359 				}
2360 			}
2361 			break;
2362 		case FORMAT_R8I:
2363 		case FORMAT_R8UI:
2364 			if(rgbaWriteMask & 0x00000001)
2365 			{
2366 				buffer = cBuffer + x;
2367 
2368 				UInt xyzw, packedCol;
2369 
2370 				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2371 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2372 				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2373 
2374 				Short4 tmpCol = Short4(As<Int4>(oC.x));
2375 				if(state.targetFormat[index] == FORMAT_R8I)
2376 				{
2377 					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2378 				}
2379 				else
2380 				{
2381 					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2382 				}
2383 				packedCol = Extract(As<Int2>(tmpCol), 0);
2384 
2385 				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2386 				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2387 
2388 				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2389 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2390 				*Pointer<UShort>(buffer) = UShort(packedCol);
2391 			}
2392 			break;
2393 		case FORMAT_G32R32F:
2394 		case FORMAT_G32R32I:
2395 		case FORMAT_G32R32UI:
2396 			buffer = cBuffer + 8 * x;
2397 
2398 			value = *Pointer<Float4>(buffer);
2399 
2400 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2401 			{
2402 				Float4 masked = value;
2403 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2404 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2405 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2406 			}
2407 
2408 			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2409 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2410 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2411 			*Pointer<Float4>(buffer) = oC.x;
2412 
2413 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2414 
2415 			value = *Pointer<Float4>(buffer);
2416 
2417 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2418 			{
2419 				Float4 masked;
2420 
2421 				masked = value;
2422 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2423 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2424 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2425 			}
2426 
2427 			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2428 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2429 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2430 			*Pointer<Float4>(buffer) = oC.y;
2431 			break;
2432 		case FORMAT_G16R16I:
2433 		case FORMAT_G16R16UI:
2434 			if((rgbaWriteMask & 0x00000003) != 0x0)
2435 			{
2436 				buffer = cBuffer + 4 * x;
2437 
2438 				UInt2 rgbaMask;
2439 				UShort4 packedCol = UShort4(As<Int4>(oC.x));
2440 				UShort4 value = *Pointer<UShort4>(buffer);
2441 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2442 				if((rgbaWriteMask & 0x3) != 0x3)
2443 				{
2444 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2445 					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2446 					mergedMask &= rgbaMask;
2447 				}
2448 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2449 
2450 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2451 
2452 				packedCol = UShort4(As<Int4>(oC.y));
2453 				value = *Pointer<UShort4>(buffer);
2454 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2455 				if((rgbaWriteMask & 0x3) != 0x3)
2456 				{
2457 					mergedMask &= rgbaMask;
2458 				}
2459 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2460 			}
2461 			break;
2462 		case FORMAT_G8R8I:
2463 		case FORMAT_G8R8UI:
2464 			if((rgbaWriteMask & 0x00000003) != 0x0)
2465 			{
2466 				buffer = cBuffer + 2 * x;
2467 
2468 				Int2 xyzw, packedCol;
2469 
2470 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2471 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2472 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2473 
2474 				if(state.targetFormat[index] == FORMAT_G8R8I)
2475 				{
2476 					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2477 				}
2478 				else
2479 				{
2480 					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2481 				}
2482 
2483 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2484 				if((rgbaWriteMask & 0x3) != 0x3)
2485 				{
2486 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2487 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2488 					mergedMask &= rgbaMask;
2489 				}
2490 
2491 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2492 
2493 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2494 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2495 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2496 			}
2497 			break;
2498 		case FORMAT_X32B32G32R32F:
2499 		case FORMAT_A32B32G32R32F:
2500 		case FORMAT_X32B32G32R32F_UNSIGNED:
2501 		case FORMAT_A32B32G32R32I:
2502 		case FORMAT_A32B32G32R32UI:
2503 			buffer = cBuffer + 16 * x;
2504 
2505 			{
2506 				value = *Pointer<Float4>(buffer, 16);
2507 
2508 				if(rgbaWriteMask != 0x0000000F)
2509 				{
2510 					Float4 masked = value;
2511 					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2512 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2513 					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2514 				}
2515 
2516 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2517 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2518 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2519 				*Pointer<Float4>(buffer, 16) = oC.x;
2520 			}
2521 
2522 			{
2523 				value = *Pointer<Float4>(buffer + 16, 16);
2524 
2525 				if(rgbaWriteMask != 0x0000000F)
2526 				{
2527 					Float4 masked = value;
2528 					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2529 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2530 					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2531 				}
2532 
2533 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2534 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2535 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2536 				*Pointer<Float4>(buffer + 16, 16) = oC.y;
2537 			}
2538 
2539 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2540 
2541 			{
2542 				value = *Pointer<Float4>(buffer, 16);
2543 
2544 				if(rgbaWriteMask != 0x0000000F)
2545 				{
2546 					Float4 masked = value;
2547 					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2548 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2549 					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2550 				}
2551 
2552 				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2553 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2554 				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2555 				*Pointer<Float4>(buffer, 16) = oC.z;
2556 			}
2557 
2558 			{
2559 				value = *Pointer<Float4>(buffer + 16, 16);
2560 
2561 				if(rgbaWriteMask != 0x0000000F)
2562 				{
2563 					Float4 masked = value;
2564 					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2565 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2566 					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2567 				}
2568 
2569 				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2570 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2571 				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2572 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
2573 			}
2574 			break;
2575 		case FORMAT_A16B16G16R16I:
2576 		case FORMAT_A16B16G16R16UI:
2577 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2578 			{
2579 				buffer = cBuffer + 8 * x;
2580 
2581 				UInt4 rgbaMask;
2582 				UShort8 value = *Pointer<UShort8>(buffer);
2583 				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2584 				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2585 				if((rgbaWriteMask & 0xF) != 0xF)
2586 				{
2587 					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2588 					rgbaMask = UInt4(tmpMask, tmpMask);
2589 					mergedMask &= rgbaMask;
2590 				}
2591 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2592 
2593 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2594 
2595 				value = *Pointer<UShort8>(buffer);
2596 				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2597 				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2598 				if((rgbaWriteMask & 0xF) != 0xF)
2599 				{
2600 					mergedMask &= rgbaMask;
2601 				}
2602 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2603 			}
2604 			break;
2605 		case FORMAT_A8B8G8R8I:
2606 		case FORMAT_A8B8G8R8UI:
2607 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2608 			{
2609 				UInt2 value, packedCol, mergedMask;
2610 
2611 				buffer = cBuffer + 4 * x;
2612 
2613 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2614 				{
2615 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2616 				}
2617 				else
2618 				{
2619 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2620 				}
2621 				value = *Pointer<UInt2>(buffer, 16);
2622 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2623 				if(rgbaWriteMask != 0xF)
2624 				{
2625 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2626 				}
2627 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2628 
2629 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2630 
2631 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2632 				{
2633 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2634 				}
2635 				else
2636 				{
2637 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2638 				}
2639 				value = *Pointer<UInt2>(buffer, 16);
2640 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2641 				if(rgbaWriteMask != 0xF)
2642 				{
2643 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2644 				}
2645 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2646 			}
2647 			break;
2648 		default:
2649 			ASSERT(false);
2650 		}
2651 	}
2652 
convertFixed16(Float4 & cf,bool saturate)2653 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2654 	{
2655 		return UShort4(cf * Float4(0xFFFF), saturate);
2656 	}
2657 
sRGBtoLinear16_12_16(Vector4s & c)2658 	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2659 	{
2660 		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2661 
2662 		c.x = As<UShort4>(c.x) >> 4;
2663 		c.y = As<UShort4>(c.y) >> 4;
2664 		c.z = As<UShort4>(c.z) >> 4;
2665 
2666 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2667 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2668 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2669 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2670 
2671 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2672 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2673 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2674 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2675 
2676 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2677 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2678 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2679 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2680 	}
2681 
linearToSRGB16_12_16(Vector4s & c)2682 	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2683 	{
2684 		c.x = As<UShort4>(c.x) >> 4;
2685 		c.y = As<UShort4>(c.y) >> 4;
2686 		c.z = As<UShort4>(c.z) >> 4;
2687 
2688 		linearToSRGB12_16(c);
2689 	}
2690 
linearToSRGB12_16(Vector4s & c)2691 	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2692 	{
2693 		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2694 
2695 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2696 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2697 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2698 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2699 
2700 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2701 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2702 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2703 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2704 
2705 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2706 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2707 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2708 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2709 	}
2710 
// Approximates x^2.2 per lane with the polynomial 0.73 * x^2 + 0.27 * x^3,
// then clamps the result to the [0, 1] range.
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	Float4 squared = x * x;
	Float4 cubed = squared * x;

	Float4 blended = squared * Float4(0.73f) + cubed * Float4(0.27f);

	// Clamp from below first, then from above.
	Float4 atLeastZero = Max(blended, Float4(0.0f));

	return Min(atLeastZero, Float4(1.0f));
}
2718 
colorUsed()2719 	bool PixelRoutine::colorUsed()
2720 	{
2721 		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2722 	}
2723 }
2724