1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "Renderer.hpp"
18 #include "QuadRasterizer.hpp"
19 #include "Surface.hpp"
20 #include "Primitive.hpp"
21 #include "CPUID.hpp"
22 #include "SamplerCore.hpp"
23 #include "Constants.hpp"
24 #include "Debug.hpp"
25 
26 namespace sw
27 {
	// Global renderer switches; 'extern' means their storage is defined elsewhere.
	extern bool complementaryDepthBuffer;   // When set, shader depth output is used as 1 - oDepth, the ordered depth comparisons are swapped, and pixel fog uses 1 - z
	extern bool postBlendSRGB;              // NOTE(review): not used near this declaration; presumably sRGB conversion after blending - confirm
	extern bool exactColorRounding;         // NOTE(review): not used near this declaration; presumably selects exact color rounding - confirm
	extern bool forceClearRegisters;        // When set, the constructor zero-fills the varying registers regardless of shader version
32 
PixelRoutine(const PixelProcessor::State & state,const PixelShader * shader)33 	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
34 	{
35 		if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
36 		{
37 			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
38 			{
39 				v[i].x = Float4(0.0f);
40 				v[i].y = Float4(0.0f);
41 				v[i].z = Float4(0.0f);
42 				v[i].w = Float4(0.0f);
43 			}
44 		}
45 	}
46 
~PixelRoutine()47 	PixelRoutine::~PixelRoutine()
48 	{
49 		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
50 		{
51 			delete sampler[i];
52 		}
53 	}
54 
	// Emits the per-quad pixel pipeline: stencil test, depth interpolation,
	// depth test (before or after shading), interpolant setup, shader execution,
	// alpha test, depth write, occlusion counting, raster operation and finally
	// the stencil write.
	//
	// cBuffer - color buffer pointers, one per render target
	// zBuffer - depth buffer pointer
	// sBuffer - stencil buffer pointer
	// cMask   - per-sample coverage masks (may be updated by the shader / alpha test)
	// x, y    - position of the quad
	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
	{
		#if PERF_PROFILE
			Long pipeTime = Ticks();
		#endif

		// One sampler core per texture image unit; released in the destructor
		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
		{
			sampler[i] = new SamplerCore(constants, state.sampler[i]);
		}

		// The depth test runs before shading unless depth is overridden or the
		// alpha test is active; otherwise it is deferred until after the alpha test
		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();

		Int zMask[4];   // Depth mask
		Int sMask[4];   // Stencil mask

		// Both masks start out equal to the coverage mask
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			zMask[q] = cMask[q];
			sMask[q] = cMask[q];
		}

		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
		}

		Float4 f;
		Float4 rhwCentroid;

		// Quad x position as float plus the per-primitive xQuad offsets
		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);

		if(interpolateZ())
		{
			for(unsigned int q = 0; q < state.multiSample; q++)
			{
				Float4 x = xxxx;

				// With multisampling, shift by the per-sample X entry of the constants table
				if(state.multiSample > 1)
				{
					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
				}

				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
			}
		}

		Bool depthPass = false;

		// depthPass becomes true as soon as any sample passes
		if(earlyDepthTest)
		{
			for(unsigned int q = 0; q < state.multiSample; q++)
			{
				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
			}
		}

		// Skip interpolation and shading when the early depth test rejected every sample
		If(depthPass || Bool(!earlyDepthTest))
		{
			#if PERF_PROFILE
				Long interpTime = Ticks();
			#endif

			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);

			// Centroid locations
			Float4 XXXX = Float4(0.0f);
			Float4 YYYY = Float4(0.0f);

			if(state.centroid)
			{
				// Starts at a tiny non-zero value (presumably to avoid a reciprocal of zero when nothing is covered)
				Float4 WWWW(1.0e-9f);

				// Accumulate per-sample table entries selected by each sample's coverage mask
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
				}

				// Normalize by the accumulated weight, then offset by the quad position
				WWWW = Rcp_pp(WWWW);
				XXXX *= WWWW;
				YYYY *= WWWW;

				XXXX += xxxx;
				YYYY += yyyy;
			}

			if(interpolateW())
			{
				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
				rhw = reciprocal(w, false, false, true);

				if(state.centroid)
				{
					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
				}
			}

			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
			{
				// Only components flagged in the state's component bit mask are interpolated
				for(int component = 0; component < 4; component++)
				{
					if(state.interpolant[interpolant].component & (1 << component))
					{
						if(!state.interpolant[interpolant].centroid)
						{
							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
						}
						else
						{
							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
						}
					}
				}

				Float4 rcp;

				// Projection: divide the leading components by component number 'project' (y, z or w)
				switch(state.interpolant[interpolant].project)
				{
				case 0:
					break;
				case 1:
					rcp = reciprocal(v[interpolant].y);
					v[interpolant].x = v[interpolant].x * rcp;
					break;
				case 2:
					rcp = reciprocal(v[interpolant].z);
					v[interpolant].x = v[interpolant].x * rcp;
					v[interpolant].y = v[interpolant].y * rcp;
					break;
				case 3:
					rcp = reciprocal(v[interpolant].w);
					v[interpolant].x = v[interpolant].x * rcp;
					v[interpolant].y = v[interpolant].y * rcp;
					v[interpolant].z = v[interpolant].z * rcp;
					break;
				}
			}

			if(state.fog.component)
			{
				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
			}

			setBuiltins(x, y, z, w);

			#if PERF_PROFILE
				cycles[PERF_INTERP] += Ticks() - interpTime;
			#endif

			Bool alphaPass = true;

			if(colorUsed())
			{
				#if PERF_PROFILE
					Long shaderTime = Ticks();
				#endif

				applyShader(cMask);

				#if PERF_PROFILE
					cycles[PERF_SHADER] += Ticks() - shaderTime;
				#endif

				alphaPass = alphaTest(cMask);

				// A kill instruction or the alpha test can change cMask;
				// fold the updated coverage into the depth and stencil masks
				if((shader && shader->containsKill()) || state.alphaTestActive())
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						zMask[q] &= cMask[q];
						sMask[q] &= cMask[q];
					}
				}
			}

			If(alphaPass)
			{
				// Late depth test, for the case where it could not run before shading
				if(!earlyDepthTest)
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
					}
				}

				#if PERF_PROFILE
					Long ropTime = Ticks();
				#endif

				If(depthPass || Bool(earlyDepthTest))
				{
					// Only samples enabled in the multisample mask are written
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						if(state.multiSampleMask & (1 << q))
						{
							writeDepth(zBuffer, q, x, z[q], zMask[q]);

							if(state.occlusionEnabled)
							{
								// Adds the occlusionCount table entry selected by the combined depth and stencil mask
								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
							}
						}
					}

					if(colorUsed())
					{
						#if PERF_PROFILE
							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
						#endif

						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
					}
				}

				#if PERF_PROFILE
					cycles[PERF_ROP] += Ticks() - ropTime;
				#endif
			}
		}

		// The stencil write sits outside the conditional blocks above, so it is
		// emitted regardless of the depth and alpha test outcome
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			if(state.multiSampleMask & (1 << q))
			{
				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
			}
		}

		#if PERF_PROFILE
			cycles[PERF_PIPE] += Ticks() - pipeTime;
		#endif
	}
289 
interpolateCentroid(Float4 & x,Float4 & y,Float4 & rhw,Pointer<Byte> planeEquation,bool flat,bool perspective)290 	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
291 	{
292 		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
293 
294 		if(!flat)
295 		{
296 			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
297 			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
298 
299 			if(perspective)
300 			{
301 				interpolant *= rhw;
302 			}
303 		}
304 
305 		return interpolant;
306 	}
307 
stencilTest(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & cMask)308 	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
309 	{
310 		if(!state.stencilActive)
311 		{
312 			return;
313 		}
314 
315 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
316 
317 		Pointer<Byte> buffer = sBuffer + 2 * x;
318 
319 		if(q > 0)
320 		{
321 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
322 		}
323 
324 		Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
325 		Byte8 valueCCW = value;
326 
327 		if(!state.noStencilMask)
328 		{
329 			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
330 		}
331 
332 		stencilTest(value, state.stencilCompareMode, false);
333 
334 		if(state.twoSidedStencil)
335 		{
336 			if(!state.noStencilMaskCCW)
337 			{
338 				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
339 			}
340 
341 			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
342 
343 			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
344 			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
345 			value |= valueCCW;
346 		}
347 
348 		sMask = SignMask(value) & cMask;
349 	}
350 
stencilTest(Byte8 & value,StencilCompareMode stencilCompareMode,bool CCW)351 	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
352 	{
353 		Byte8 equal;
354 
355 		switch(stencilCompareMode)
356 		{
357 		case STENCIL_ALWAYS:
358 			value = Byte8(0xFFFFFFFFFFFFFFFF);
359 			break;
360 		case STENCIL_NEVER:
361 			value = Byte8(0x0000000000000000);
362 			break;
363 		case STENCIL_LESS:			// a < b ~ b > a
364 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
365 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
366 			break;
367 		case STENCIL_EQUAL:
368 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
369 			break;
370 		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
371 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
372 			value ^= Byte8(0xFFFFFFFFFFFFFFFF);
373 			break;
374 		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
375 			equal = value;
376 			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
377 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
378 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
379 			value |= equal;
380 			break;
381 		case STENCIL_GREATER:		// a > b
382 			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
383 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
384 			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
385 			value = equal;
386 			break;
387 		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
388 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
389 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
390 			value ^= Byte8(0xFFFFFFFFFFFFFFFF);
391 			break;
392 		default:
393 			ASSERT(false);
394 		}
395 	}
396 
depthTest(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & sMask,Int & zMask,Int & cMask)397 	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
398 	{
399 		if(!state.depthTestActive)
400 		{
401 			return true;
402 		}
403 
404 		Float4 Z = z;
405 
406 		if(shader && shader->depthOverride())
407 		{
408 			if(complementaryDepthBuffer)
409 			{
410 				Z = Float4(1.0f) - oDepth;
411 			}
412 			else
413 			{
414 				Z = oDepth;
415 			}
416 		}
417 
418 		Pointer<Byte> buffer;
419 		Int pitch;
420 
421 		if(!state.quadLayoutDepthBuffer)
422 		{
423 			buffer = zBuffer + 4 * x;
424 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
425 		}
426 		else
427 		{
428 			buffer = zBuffer + 8 * x;
429 		}
430 
431 		if(q > 0)
432 		{
433 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
434 		}
435 
436 		Float4 zValue;
437 
438 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
439 		{
440 			if(!state.quadLayoutDepthBuffer)
441 			{
442 				// FIXME: Properly optimizes?
443 				zValue.xy = *Pointer<Float4>(buffer);
444 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
445 			}
446 			else
447 			{
448 				zValue = *Pointer<Float4>(buffer, 16);
449 			}
450 		}
451 
452 		Int4 zTest;
453 
454 		switch(state.depthCompareMode)
455 		{
456 		case DEPTH_ALWAYS:
457 			// Optimized
458 			break;
459 		case DEPTH_NEVER:
460 			// Optimized
461 			break;
462 		case DEPTH_EQUAL:
463 			zTest = CmpEQ(zValue, Z);
464 			break;
465 		case DEPTH_NOTEQUAL:
466 			zTest = CmpNEQ(zValue, Z);
467 			break;
468 		case DEPTH_LESS:
469 			if(complementaryDepthBuffer)
470 			{
471 				zTest = CmpLT(zValue, Z);
472 			}
473 			else
474 			{
475 				zTest = CmpNLE(zValue, Z);
476 			}
477 			break;
478 		case DEPTH_GREATEREQUAL:
479 			if(complementaryDepthBuffer)
480 			{
481 				zTest = CmpNLT(zValue, Z);
482 			}
483 			else
484 			{
485 				zTest = CmpLE(zValue, Z);
486 			}
487 			break;
488 		case DEPTH_LESSEQUAL:
489 			if(complementaryDepthBuffer)
490 			{
491 				zTest = CmpLE(zValue, Z);
492 			}
493 			else
494 			{
495 				zTest = CmpNLT(zValue, Z);
496 			}
497 			break;
498 		case DEPTH_GREATER:
499 			if(complementaryDepthBuffer)
500 			{
501 				zTest = CmpNLE(zValue, Z);
502 			}
503 			else
504 			{
505 				zTest = CmpLT(zValue, Z);
506 			}
507 			break;
508 		default:
509 			ASSERT(false);
510 		}
511 
512 		switch(state.depthCompareMode)
513 		{
514 		case DEPTH_ALWAYS:
515 			zMask = cMask;
516 			break;
517 		case DEPTH_NEVER:
518 			zMask = 0x0;
519 			break;
520 		default:
521 			zMask = SignMask(zTest) & cMask;
522 			break;
523 		}
524 
525 		if(state.stencilActive)
526 		{
527 			zMask &= sMask;
528 		}
529 
530 		return zMask != 0;
531 	}
532 
alphaTest(Int & aMask,Short4 & alpha)533 	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
534 	{
535 		Short4 cmp;
536 		Short4 equal;
537 
538 		switch(state.alphaCompareMode)
539 		{
540 		case ALPHA_ALWAYS:
541 			aMask = 0xF;
542 			break;
543 		case ALPHA_NEVER:
544 			aMask = 0x0;
545 			break;
546 		case ALPHA_EQUAL:
547 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
548 			aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
549 			break;
550 		case ALPHA_NOTEQUAL:		// a != b ~ !(a == b)
551 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
552 			aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
553 			break;
554 		case ALPHA_LESS:			// a < b ~ b > a
555 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
556 			aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
557 			break;
558 		case ALPHA_GREATEREQUAL:	// a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
559 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
560 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
561 			cmp |= equal;
562 			aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
563 			break;
564 		case ALPHA_LESSEQUAL:		// a <= b ~ !(a > b)
565 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
566 			aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
567 			break;
568 		case ALPHA_GREATER:			// a > b
569 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
570 			aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000)));
571 			break;
572 		default:
573 			ASSERT(false);
574 		}
575 	}
576 
alphaToCoverage(Int cMask[4],Float4 & alpha)577 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
578 	{
579 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
580 		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
581 		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
582 		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
583 
584 		Int aMask0 = SignMask(coverage0);
585 		Int aMask1 = SignMask(coverage1);
586 		Int aMask2 = SignMask(coverage2);
587 		Int aMask3 = SignMask(coverage3);
588 
589 		cMask[0] &= aMask0;
590 		cMask[1] &= aMask1;
591 		cMask[2] &= aMask2;
592 		cMask[3] &= aMask3;
593 	}
594 
fogBlend(Vector4f & c0,Float4 & fog)595 	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
596 	{
597 		if(!state.fogActive)
598 		{
599 			return;
600 		}
601 
602 		if(state.pixelFogMode != FOG_NONE)
603 		{
604 			pixelFog(fog);
605 
606 			fog = Min(fog, Float4(1.0f));
607 			fog = Max(fog, Float4(0.0f));
608 		}
609 
610 		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
611 		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
612 		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
613 
614 		c0.x *= fog;
615 		c0.y *= fog;
616 		c0.z *= fog;
617 
618 		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
619 		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
620 		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
621 	}
622 
pixelFog(Float4 & visibility)623 	void PixelRoutine::pixelFog(Float4 &visibility)
624 	{
625 		Float4 &zw = visibility;
626 
627 		if(state.pixelFogMode != FOG_NONE)
628 		{
629 			if(state.wBasedFog)
630 			{
631 				zw = rhw;
632 			}
633 			else
634 			{
635 				if(complementaryDepthBuffer)
636 				{
637 					zw = Float4(1.0f) - z[0];
638 				}
639 				else
640 				{
641 					zw = z[0];
642 				}
643 			}
644 		}
645 
646 		switch(state.pixelFogMode)
647 		{
648 		case FOG_NONE:
649 			break;
650 		case FOG_LINEAR:
651 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
652 			zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
653 			break;
654 		case FOG_EXP:
655 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
656 			zw = exponential2(zw, true);
657 			break;
658 		case FOG_EXP2:
659 			zw *= zw;
660 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
661 			zw = exponential2(zw, true);
662 			break;
663 		default:
664 			ASSERT(false);
665 		}
666 	}
667 
writeDepth(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & zMask)668 	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
669 	{
670 		if(!state.depthWriteEnable)
671 		{
672 			return;
673 		}
674 
675 		Float4 Z = z;
676 
677 		if(shader && shader->depthOverride())
678 		{
679 			if(complementaryDepthBuffer)
680 			{
681 				Z = Float4(1.0f) - oDepth;
682 			}
683 			else
684 			{
685 				Z = oDepth;
686 			}
687 		}
688 
689 		Pointer<Byte> buffer;
690 		Int pitch;
691 
692 		if(!state.quadLayoutDepthBuffer)
693 		{
694 			buffer = zBuffer + 4 * x;
695 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
696 		}
697 		else
698 		{
699 			buffer = zBuffer + 8 * x;
700 		}
701 
702 		if(q > 0)
703 		{
704 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
705 		}
706 
707 		Float4 zValue;
708 
709 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
710 		{
711 			if(!state.quadLayoutDepthBuffer)
712 			{
713 				// FIXME: Properly optimizes?
714 				zValue.xy = *Pointer<Float4>(buffer);
715 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
716 			}
717 			else
718 			{
719 				zValue = *Pointer<Float4>(buffer, 16);
720 			}
721 		}
722 
723 		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
724 		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
725 		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
726 
727 		if(!state.quadLayoutDepthBuffer)
728 		{
729 			// FIXME: Properly optimizes?
730 			*Pointer<Float2>(buffer) = Float2(Z.xy);
731 			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
732 		}
733 		else
734 		{
735 			*Pointer<Float4>(buffer, 16) = Z;
736 		}
737 	}
738 
writeStencil(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & zMask,Int & cMask)739 	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
740 	{
741 		if(!state.stencilActive)
742 		{
743 			return;
744 		}
745 
746 		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
747 		{
748 			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
749 			{
750 				return;
751 			}
752 		}
753 
754 		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
755 		{
756 			return;
757 		}
758 
759 		Pointer<Byte> buffer = sBuffer + 2 * x;
760 
761 		if(q > 0)
762 		{
763 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
764 		}
765 
766 		Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
767 
768 		Byte8 newValue;
769 		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
770 
771 		if(!state.noStencilWriteMask)
772 		{
773 			Byte8 maskedValue = bufferValue;
774 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
775 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
776 			newValue |= maskedValue;
777 		}
778 
779 		if(state.twoSidedStencil)
780 		{
781 			Byte8 newValueCCW;
782 
783 			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
784 
785 			if(!state.noStencilWriteMaskCCW)
786 			{
787 				Byte8 maskedValue = bufferValue;
788 				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
789 				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
790 				newValueCCW |= maskedValue;
791 			}
792 
793 			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
794 			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
795 			newValue |= newValueCCW;
796 		}
797 
798 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
799 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
800 		newValue |= bufferValue;
801 
802 		*Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
803 	}
804 
stencilOperation(Byte8 & newValue,Byte8 & bufferValue,StencilOperation stencilPassOperation,StencilOperation stencilZFailOperation,StencilOperation stencilFailOperation,bool CCW,Int & zMask,Int & sMask)805 	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
806 	{
807 		Byte8 &pass = newValue;
808 		Byte8 fail;
809 		Byte8 zFail;
810 
811 		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
812 
813 		if(stencilZFailOperation != stencilPassOperation)
814 		{
815 			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
816 		}
817 
818 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
819 		{
820 			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
821 		}
822 
823 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
824 		{
825 			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
826 			{
827 				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
828 				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
829 				pass |= zFail;
830 			}
831 
832 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
833 			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
834 			pass |= fail;
835 		}
836 	}
837 
stencilOperation(Byte8 & output,Byte8 & bufferValue,StencilOperation operation,bool CCW)838 	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
839 	{
840 		switch(operation)
841 		{
842 		case OPERATION_KEEP:
843 			output = bufferValue;
844 			break;
845 		case OPERATION_ZERO:
846 			output = Byte8(0x0000000000000000);
847 			break;
848 		case OPERATION_REPLACE:
849 			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
850 			break;
851 		case OPERATION_INCRSAT:
852 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
853 			break;
854 		case OPERATION_DECRSAT:
855 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
856 			break;
857 		case OPERATION_INVERT:
858 			output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
859 			break;
860 		case OPERATION_INCR:
861 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
862 			break;
863 		case OPERATION_DECR:
864 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
865 			break;
866 		default:
867 			ASSERT(false);
868 		}
869 	}
870 
blendFactor(const Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorActive)871 	void PixelRoutine::blendFactor(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
872 	{
873 		switch(blendFactorActive)
874 		{
875 		case BLEND_ZERO:
876 			// Optimized
877 			break;
878 		case BLEND_ONE:
879 			// Optimized
880 			break;
881 		case BLEND_SOURCE:
882 			blendFactor.x = current.x;
883 			blendFactor.y = current.y;
884 			blendFactor.z = current.z;
885 			break;
886 		case BLEND_INVSOURCE:
887 			blendFactor.x = Short4(0xFFFFu) - current.x;
888 			blendFactor.y = Short4(0xFFFFu) - current.y;
889 			blendFactor.z = Short4(0xFFFFu) - current.z;
890 			break;
891 		case BLEND_DEST:
892 			blendFactor.x = pixel.x;
893 			blendFactor.y = pixel.y;
894 			blendFactor.z = pixel.z;
895 			break;
896 		case BLEND_INVDEST:
897 			blendFactor.x = Short4(0xFFFFu) - pixel.x;
898 			blendFactor.y = Short4(0xFFFFu) - pixel.y;
899 			blendFactor.z = Short4(0xFFFFu) - pixel.z;
900 			break;
901 		case BLEND_SOURCEALPHA:
902 			blendFactor.x = current.w;
903 			blendFactor.y = current.w;
904 			blendFactor.z = current.w;
905 			break;
906 		case BLEND_INVSOURCEALPHA:
907 			blendFactor.x = Short4(0xFFFFu) - current.w;
908 			blendFactor.y = Short4(0xFFFFu) - current.w;
909 			blendFactor.z = Short4(0xFFFFu) - current.w;
910 			break;
911 		case BLEND_DESTALPHA:
912 			blendFactor.x = pixel.w;
913 			blendFactor.y = pixel.w;
914 			blendFactor.z = pixel.w;
915 			break;
916 		case BLEND_INVDESTALPHA:
917 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
918 			blendFactor.y = Short4(0xFFFFu) - pixel.w;
919 			blendFactor.z = Short4(0xFFFFu) - pixel.w;
920 			break;
921 		case BLEND_SRCALPHASAT:
922 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
923 			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
924 			blendFactor.y = blendFactor.x;
925 			blendFactor.z = blendFactor.x;
926 			break;
927 		case BLEND_CONSTANT:
928 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
929 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
930 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
931 			break;
932 		case BLEND_INVCONSTANT:
933 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
934 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
935 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
936 			break;
937 		case BLEND_CONSTANTALPHA:
938 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
940 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
941 			break;
942 		case BLEND_INVCONSTANTALPHA:
943 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
944 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
945 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
946 			break;
947 		default:
948 			ASSERT(false);
949 		}
950 	}
951 
blendFactorAlpha(const Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorAlphaActive)952 	void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
953 	{
954 		switch(blendFactorAlphaActive)
955 		{
956 		case BLEND_ZERO:
957 			// Optimized
958 			break;
959 		case BLEND_ONE:
960 			// Optimized
961 			break;
962 		case BLEND_SOURCE:
963 			blendFactor.w = current.w;
964 			break;
965 		case BLEND_INVSOURCE:
966 			blendFactor.w = Short4(0xFFFFu) - current.w;
967 			break;
968 		case BLEND_DEST:
969 			blendFactor.w = pixel.w;
970 			break;
971 		case BLEND_INVDEST:
972 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
973 			break;
974 		case BLEND_SOURCEALPHA:
975 			blendFactor.w = current.w;
976 			break;
977 		case BLEND_INVSOURCEALPHA:
978 			blendFactor.w = Short4(0xFFFFu) - current.w;
979 			break;
980 		case BLEND_DESTALPHA:
981 			blendFactor.w = pixel.w;
982 			break;
983 		case BLEND_INVDESTALPHA:
984 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
985 			break;
986 		case BLEND_SRCALPHASAT:
987 			blendFactor.w = Short4(0xFFFFu);
988 			break;
989 		case BLEND_CONSTANT:
990 		case BLEND_CONSTANTALPHA:
991 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
992 			break;
993 		case BLEND_INVCONSTANT:
994 		case BLEND_INVCONSTANTALPHA:
995 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
996 			break;
997 		default:
998 			ASSERT(false);
999 		}
1000 	}
1001 
isSRGB(int index) const1002 	bool PixelRoutine::isSRGB(int index) const
1003 	{
1004 		return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
1005 	}
1006 
readPixel(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & pixel)1007 	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1008 	{
1009 		Short4 c01;
1010 		Short4 c23;
1011 		Pointer<Byte> buffer;
1012 		Pointer<Byte> buffer2;
1013 
1014 		switch(state.targetFormat[index])
1015 		{
1016 		case FORMAT_R5G6B5:
1017 			buffer = cBuffer + 2 * x;
1018 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1019 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1020 
1021 			pixel.x = c01 & Short4(0xF800u);
1022 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
1023 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
1024 			pixel.w = Short4(0xFFFFu);
1025 			break;
1026 		case FORMAT_A8R8G8B8:
1027 			buffer = cBuffer + 4 * x;
1028 			c01 = *Pointer<Short4>(buffer);
1029 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1030 			c23 = *Pointer<Short4>(buffer);
1031 			pixel.z = c01;
1032 			pixel.y = c01;
1033 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1034 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1035 			pixel.x = pixel.z;
1036 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1037 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1038 			pixel.y = pixel.z;
1039 			pixel.w = pixel.x;
1040 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1041 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1042 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1043 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1044 			break;
1045 		case FORMAT_A8B8G8R8:
1046 		case FORMAT_SRGB8_A8:
1047 			buffer = cBuffer + 4 * x;
1048 			c01 = *Pointer<Short4>(buffer);
1049 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1050 			c23 = *Pointer<Short4>(buffer);
1051 			pixel.z = c01;
1052 			pixel.y = c01;
1053 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1054 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1055 			pixel.x = pixel.z;
1056 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1057 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1058 			pixel.y = pixel.z;
1059 			pixel.w = pixel.x;
1060 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1061 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1062 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1063 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1064 			break;
1065 		case FORMAT_A8:
1066 			buffer = cBuffer + 1 * x;
1067 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1068 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1069 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1070 			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1071 			pixel.x = Short4(0x0000);
1072 			pixel.y = Short4(0x0000);
1073 			pixel.z = Short4(0x0000);
1074 			break;
1075 		case FORMAT_X8R8G8B8:
1076 			buffer = cBuffer + 4 * x;
1077 			c01 = *Pointer<Short4>(buffer);
1078 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1079 			c23 = *Pointer<Short4>(buffer);
1080 			pixel.z = c01;
1081 			pixel.y = c01;
1082 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1083 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1084 			pixel.x = pixel.z;
1085 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1086 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1087 			pixel.y = pixel.z;
1088 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1089 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1090 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1091 			pixel.w = Short4(0xFFFFu);
1092 			break;
1093 		case FORMAT_X8B8G8R8:
1094 		case FORMAT_SRGB8_X8:
1095 			buffer = cBuffer + 4 * x;
1096 			c01 = *Pointer<Short4>(buffer);
1097 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1098 			c23 = *Pointer<Short4>(buffer);
1099 			pixel.z = c01;
1100 			pixel.y = c01;
1101 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1102 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1103 			pixel.x = pixel.z;
1104 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1105 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1106 			pixel.y = pixel.z;
1107 			pixel.w = pixel.x;
1108 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1109 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1110 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1111 			pixel.w = Short4(0xFFFFu);
1112 			break;
1113 		case FORMAT_A8G8R8B8Q:
1114 			UNIMPLEMENTED();
1115 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1116 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1117 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1118 		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1119 			break;
1120 		case FORMAT_X8G8R8B8Q:
1121 			UNIMPLEMENTED();
1122 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1123 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1124 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1125 		//	pixel.w = Short4(0xFFFFu);
1126 			break;
1127 		case FORMAT_A16B16G16R16:
1128 			buffer = cBuffer;
1129 			pixel.x = *Pointer<Short4>(buffer + 8 * x);
1130 			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1131 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1132 			pixel.z = *Pointer<Short4>(buffer + 8 * x);
1133 			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1134 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1135 			break;
1136 		case FORMAT_G16R16:
1137 			buffer = cBuffer;
1138 			pixel.x = *Pointer<Short4>(buffer + 4 * x);
1139 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1140 			pixel.y = *Pointer<Short4>(buffer + 4 * x);
1141 			pixel.z = pixel.x;
1142 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1143 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1144 			pixel.y = pixel.z;
1145 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1146 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1147 			pixel.z = Short4(0xFFFFu);
1148 			pixel.w = Short4(0xFFFFu);
1149 			break;
1150 		default:
1151 			ASSERT(false);
1152 		}
1153 
1154 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1155 		{
1156 			sRGBtoLinear16_12_16(pixel);
1157 		}
1158 	}
1159 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1160 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1161 	{
1162 		if(!state.alphaBlendActive)
1163 		{
1164 			return;
1165 		}
1166 
1167 		Vector4s pixel;
1168 		readPixel(index, cBuffer, x, pixel);
1169 
1170 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1171 		Vector4s sourceFactor;
1172 		Vector4s destFactor;
1173 
1174 		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1175 		blendFactor(destFactor, current, pixel, state.destBlendFactor);
1176 
1177 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1178 		{
1179 			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1180 			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1181 			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1182 		}
1183 
1184 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1185 		{
1186 			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1187 			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1188 			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1189 		}
1190 
1191 		switch(state.blendOperation)
1192 		{
1193 		case BLENDOP_ADD:
1194 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1195 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1196 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1197 			break;
1198 		case BLENDOP_SUB:
1199 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1200 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1201 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1202 			break;
1203 		case BLENDOP_INVSUB:
1204 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1205 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1206 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1207 			break;
1208 		case BLENDOP_MIN:
1209 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1210 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1211 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1212 			break;
1213 		case BLENDOP_MAX:
1214 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1215 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1216 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1217 			break;
1218 		case BLENDOP_SOURCE:
1219 			// No operation
1220 			break;
1221 		case BLENDOP_DEST:
1222 			current.x = pixel.x;
1223 			current.y = pixel.y;
1224 			current.z = pixel.z;
1225 			break;
1226 		case BLENDOP_NULL:
1227 			current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1228 			current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1229 			current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1230 			break;
1231 		default:
1232 			ASSERT(false);
1233 		}
1234 
1235 		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1236 		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1237 
1238 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1239 		{
1240 			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1241 		}
1242 
1243 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1244 		{
1245 			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1246 		}
1247 
1248 		switch(state.blendOperationAlpha)
1249 		{
1250 		case BLENDOP_ADD:
1251 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1252 			break;
1253 		case BLENDOP_SUB:
1254 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1255 			break;
1256 		case BLENDOP_INVSUB:
1257 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1258 			break;
1259 		case BLENDOP_MIN:
1260 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1261 			break;
1262 		case BLENDOP_MAX:
1263 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1264 			break;
1265 		case BLENDOP_SOURCE:
1266 			// No operation
1267 			break;
1268 		case BLENDOP_DEST:
1269 			current.w = pixel.w;
1270 			break;
1271 		case BLENDOP_NULL:
1272 			current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1273 			break;
1274 		default:
1275 			ASSERT(false);
1276 		}
1277 	}
1278 
logicOperation(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1279 	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1280 	{
1281 		if(state.logicalOperation == LOGICALOP_COPY)
1282 		{
1283 			return;
1284 		}
1285 
1286 		Vector4s pixel;
1287 		readPixel(index, cBuffer, x, pixel);
1288 
1289 		switch(state.logicalOperation)
1290 		{
1291 		case LOGICALOP_CLEAR:
1292 			current.x = 0;
1293 			current.y = 0;
1294 			current.z = 0;
1295 			break;
1296 		case LOGICALOP_SET:
1297 			current.x = 0xFFFFu;
1298 			current.y = 0xFFFFu;
1299 			current.z = 0xFFFFu;
1300 			break;
1301 		case LOGICALOP_COPY:
1302 			ASSERT(false);   // Optimized out
1303 			break;
1304 		case LOGICALOP_COPY_INVERTED:
1305 			current.x = ~current.x;
1306 			current.y = ~current.y;
1307 			current.z = ~current.z;
1308 			break;
1309 		case LOGICALOP_NOOP:
1310 			current.x = pixel.x;
1311 			current.y = pixel.y;
1312 			current.z = pixel.z;
1313 			break;
1314 		case LOGICALOP_INVERT:
1315 			current.x = ~pixel.x;
1316 			current.y = ~pixel.y;
1317 			current.z = ~pixel.z;
1318 			break;
1319 		case LOGICALOP_AND:
1320 			current.x = pixel.x & current.x;
1321 			current.y = pixel.y & current.y;
1322 			current.z = pixel.z & current.z;
1323 			break;
1324 		case LOGICALOP_NAND:
1325 			current.x = ~(pixel.x & current.x);
1326 			current.y = ~(pixel.y & current.y);
1327 			current.z = ~(pixel.z & current.z);
1328 			break;
1329 		case LOGICALOP_OR:
1330 			current.x = pixel.x | current.x;
1331 			current.y = pixel.y | current.y;
1332 			current.z = pixel.z | current.z;
1333 			break;
1334 		case LOGICALOP_NOR:
1335 			current.x = ~(pixel.x | current.x);
1336 			current.y = ~(pixel.y | current.y);
1337 			current.z = ~(pixel.z | current.z);
1338 			break;
1339 		case LOGICALOP_XOR:
1340 			current.x = pixel.x ^ current.x;
1341 			current.y = pixel.y ^ current.y;
1342 			current.z = pixel.z ^ current.z;
1343 			break;
1344 		case LOGICALOP_EQUIV:
1345 			current.x = ~(pixel.x ^ current.x);
1346 			current.y = ~(pixel.y ^ current.y);
1347 			current.z = ~(pixel.z ^ current.z);
1348 			break;
1349 		case LOGICALOP_AND_REVERSE:
1350 			current.x = ~pixel.x & current.x;
1351 			current.y = ~pixel.y & current.y;
1352 			current.z = ~pixel.z & current.z;
1353 			break;
1354 		case LOGICALOP_AND_INVERTED:
1355 			current.x = pixel.x & ~current.x;
1356 			current.y = pixel.y & ~current.y;
1357 			current.z = pixel.z & ~current.z;
1358 			break;
1359 		case LOGICALOP_OR_REVERSE:
1360 			current.x = ~pixel.x | current.x;
1361 			current.y = ~pixel.y | current.y;
1362 			current.z = ~pixel.z | current.z;
1363 			break;
1364 		case LOGICALOP_OR_INVERTED:
1365 			current.x = pixel.x | ~current.x;
1366 			current.y = pixel.y | ~current.y;
1367 			current.z = pixel.z | ~current.z;
1368 			break;
1369 		default:
1370 			ASSERT(false);
1371 		}
1372 	}
1373 
writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & current,Int & sMask,Int & zMask,Int & cMask)1374 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1375 	{
1376 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1377 		{
1378 			linearToSRGB16_12_16(current);
1379 		}
1380 
1381 		if(exactColorRounding)
1382 		{
1383 			switch(state.targetFormat[index])
1384 			{
1385 			case FORMAT_R5G6B5:
1386 				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1387 				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1388 				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1389 				break;
1390 			case FORMAT_X8G8R8B8Q:
1391 			case FORMAT_A8G8R8B8Q:
1392 			case FORMAT_X8R8G8B8:
1393 			case FORMAT_X8B8G8R8:
1394 			case FORMAT_A8R8G8B8:
1395 			case FORMAT_A8B8G8R8:
1396 			case FORMAT_SRGB8_X8:
1397 			case FORMAT_SRGB8_A8:
1398 				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1399 				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1400 				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1401 				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
1402 				break;
1403 			default:
1404 				break;
1405 			}
1406 		}
1407 
1408 		int rgbaWriteMask = state.colorWriteActive(index);
1409 		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1410 		int brgaWriteMask = (rgbaWriteMask & 0x00000008) | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2;
1411 
1412 		switch(state.targetFormat[index])
1413 		{
1414 		case FORMAT_R5G6B5:
1415 			{
1416 				current.x = current.x & Short4(0xF800u);
1417 				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1418 				current.z = As<UShort4>(current.z) >> 11;
1419 
1420 				current.x = current.x | current.y | current.z;
1421 			}
1422 			break;
1423 		case FORMAT_X8G8R8B8Q:
1424 			UNIMPLEMENTED();
1425 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1426 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1427 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1428 
1429 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1430 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1431 			break;
1432 		case FORMAT_A8G8R8B8Q:
1433 			UNIMPLEMENTED();
1434 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1435 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1436 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1437 		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1438 
1439 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1440 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1441 			break;
1442 		case FORMAT_X8R8G8B8:
1443 		case FORMAT_A8R8G8B8:
1444 			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1445 			{
1446 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449 
1450 				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1451 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1452 
1453 				current.x = current.z;
1454 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1455 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1456 				current.y = current.z;
1457 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1458 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1459 			}
1460 			else
1461 			{
1462 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1463 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1464 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1465 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1466 
1467 				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1468 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1469 
1470 				current.x = current.z;
1471 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1472 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1473 				current.y = current.z;
1474 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1475 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1476 			}
1477 			break;
1478 		case FORMAT_X8B8G8R8:
1479 		case FORMAT_A8B8G8R8:
1480 		case FORMAT_SRGB8_X8:
1481 		case FORMAT_SRGB8_A8:
1482 			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1483 			{
1484 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1485 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1486 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1487 
1488 				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1489 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1490 
1491 				current.x = current.z;
1492 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1493 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1494 				current.y = current.z;
1495 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1496 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1497 			}
1498 			else
1499 			{
1500 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1501 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1502 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1503 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1504 
1505 				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1506 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1507 
1508 				current.x = current.z;
1509 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1510 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1511 				current.y = current.z;
1512 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1513 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1514 			}
1515 			break;
1516 		case FORMAT_A8:
1517 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1518 			current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1519 			break;
1520 		case FORMAT_G16R16:
1521 			current.z = current.x;
1522 			current.x = As<Short4>(UnpackLow(current.x, current.y));
1523 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
1524 			current.y = current.z;
1525 			break;
1526 		case FORMAT_A16B16G16R16:
1527 			transpose4x4(current.x, current.y, current.z, current.w);
1528 			break;
1529 		default:
1530 			ASSERT(false);
1531 		}
1532 
1533 		Short4 c01 = current.z;
1534 		Short4 c23 = current.y;
1535 
1536 		Int xMask;   // Combination of all masks
1537 
1538 		if(state.depthTestActive)
1539 		{
1540 			xMask = zMask;
1541 		}
1542 		else
1543 		{
1544 			xMask = cMask;
1545 		}
1546 
1547 		if(state.stencilActive)
1548 		{
1549 			xMask &= sMask;
1550 		}
1551 
1552 		switch(state.targetFormat[index])
1553 		{
1554 		case FORMAT_R5G6B5:
1555 			{
1556 				Pointer<Byte> buffer = cBuffer + 2 * x;
1557 				Int value = *Pointer<Int>(buffer);
1558 
1559 				Int c01 = Extract(As<Int2>(current.x), 0);
1560 
1561 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1562 				{
1563 					Int masked = value;
1564 					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1565 					masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1566 					c01 |= masked;
1567 				}
1568 
1569 				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1570 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1571 				c01 |= value;
1572 				*Pointer<Int>(buffer) = c01;
1573 
1574 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1575 				value = *Pointer<Int>(buffer);
1576 
1577 				Int c23 = Extract(As<Int2>(current.x), 1);
1578 
1579 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1580 				{
1581 					Int masked = value;
1582 					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1583 					masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1584 					c23 |= masked;
1585 				}
1586 
1587 				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1588 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1589 				c23 |= value;
1590 				*Pointer<Int>(buffer) = c23;
1591 			}
1592 			break;
1593 		case FORMAT_A8G8R8B8Q:
1594 		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1595 			UNIMPLEMENTED();
1596 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1597 
1598 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1599 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1600 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1601 		//	{
1602 		//		Short4 masked = value;
1603 		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1604 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1605 		//		c01 |= masked;
1606 		//	}
1607 
1608 		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1609 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1610 		//	c01 |= value;
1611 		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1612 
1613 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1614 
1615 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1616 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1617 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1618 		//	{
1619 		//		Short4 masked = value;
1620 		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1621 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1622 		//		c23 |= masked;
1623 		//	}
1624 
1625 		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1626 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1627 		//	c23 |= value;
1628 		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1629 			break;
1630 		case FORMAT_A8R8G8B8:
1631 		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1632 			{
1633 				Pointer<Byte> buffer = cBuffer + x * 4;
1634 				Short4 value = *Pointer<Short4>(buffer);
1635 
1636 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1637 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1638 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1639 				{
1640 					Short4 masked = value;
1641 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1642 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1643 					c01 |= masked;
1644 				}
1645 
1646 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1647 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1648 				c01 |= value;
1649 				*Pointer<Short4>(buffer) = c01;
1650 
1651 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1652 				value = *Pointer<Short4>(buffer);
1653 
1654 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1655 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1656 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1657 				{
1658 					Short4 masked = value;
1659 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1660 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1661 					c23 |= masked;
1662 				}
1663 
1664 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1665 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1666 				c23 |= value;
1667 				*Pointer<Short4>(buffer) = c23;
1668 			}
1669 			break;
1670 		case FORMAT_A8B8G8R8:
1671 		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1672 		case FORMAT_SRGB8_X8:
1673 		case FORMAT_SRGB8_A8:
1674 			{
1675 				Pointer<Byte> buffer = cBuffer + x * 4;
1676 				Short4 value = *Pointer<Short4>(buffer);
1677 
1678 				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1679 				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1680 				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1681 
1682 				if(masked)
1683 				{
1684 					Short4 masked = value;
1685 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1686 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1687 					c01 |= masked;
1688 				}
1689 
1690 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1691 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1692 				c01 |= value;
1693 				*Pointer<Short4>(buffer) = c01;
1694 
1695 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1696 				value = *Pointer<Short4>(buffer);
1697 
1698 				if(masked)
1699 				{
1700 					Short4 masked = value;
1701 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1702 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1703 					c23 |= masked;
1704 				}
1705 
1706 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1707 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1708 				c23 |= value;
1709 				*Pointer<Short4>(buffer) = c23;
1710 			}
1711 			break;
1712 		case FORMAT_A8:
1713 			if(rgbaWriteMask & 0x00000008)
1714 			{
1715 				Pointer<Byte> buffer = cBuffer + 1 * x;
1716 				Short4 value;
1717 				Insert(value, *Pointer<Short>(buffer), 0);
1718 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1719 				Insert(value, *Pointer<Short>(buffer + pitch), 1);
1720 				value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1721 
1722 				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1723 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1724 				current.w |= value;
1725 
1726 				*Pointer<Short>(buffer) = Extract(current.w, 0);
1727 				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1728 			}
1729 			break;
1730 		case FORMAT_G16R16:
1731 			{
1732 				Pointer<Byte> buffer = cBuffer + 4 * x;
1733 
1734 				Short4 value = *Pointer<Short4>(buffer);
1735 
1736 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1737 				{
1738 					Short4 masked = value;
1739 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1740 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1741 					current.x |= masked;
1742 				}
1743 
1744 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1745 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1746 				current.x |= value;
1747 				*Pointer<Short4>(buffer) = current.x;
1748 
1749 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1750 
1751 				value = *Pointer<Short4>(buffer);
1752 
1753 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1754 				{
1755 					Short4 masked = value;
1756 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1757 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1758 					current.y |= masked;
1759 				}
1760 
1761 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1762 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1763 				current.y |= value;
1764 				*Pointer<Short4>(buffer) = current.y;
1765 			}
1766 			break;
1767 		case FORMAT_A16B16G16R16:
1768 			{
1769 				Pointer<Byte> buffer = cBuffer + 8 * x;
1770 
1771 				{
1772 					Short4 value = *Pointer<Short4>(buffer);
1773 
1774 					if(rgbaWriteMask != 0x0000000F)
1775 					{
1776 						Short4 masked = value;
1777 						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1778 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1779 						current.x |= masked;
1780 					}
1781 
1782 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1783 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1784 					current.x |= value;
1785 					*Pointer<Short4>(buffer) = current.x;
1786 				}
1787 
1788 				{
1789 					Short4 value = *Pointer<Short4>(buffer + 8);
1790 
1791 					if(rgbaWriteMask != 0x0000000F)
1792 					{
1793 						Short4 masked = value;
1794 						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1795 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1796 						current.y |= masked;
1797 					}
1798 
1799 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1800 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1801 					current.y |= value;
1802 					*Pointer<Short4>(buffer + 8) = current.y;
1803 				}
1804 
1805 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1806 
1807 				{
1808 					Short4 value = *Pointer<Short4>(buffer);
1809 
1810 					if(rgbaWriteMask != 0x0000000F)
1811 					{
1812 						Short4 masked = value;
1813 						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1814 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1815 						current.z |= masked;
1816 					}
1817 
1818 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1819 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1820 					current.z |= value;
1821 					*Pointer<Short4>(buffer) = current.z;
1822 				}
1823 
1824 				{
1825 					Short4 value = *Pointer<Short4>(buffer + 8);
1826 
1827 					if(rgbaWriteMask != 0x0000000F)
1828 					{
1829 						Short4 masked = value;
1830 						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1831 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1832 						current.w |= masked;
1833 					}
1834 
1835 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1836 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1837 					current.w |= value;
1838 					*Pointer<Short4>(buffer + 8) = current.w;
1839 				}
1840 			}
1841 			break;
1842 		default:
1843 			ASSERT(false);
1844 		}
1845 	}
1846 
blendFactor(const Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorActive)1847 	void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1848 	{
1849 		switch(blendFactorActive)
1850 		{
1851 		case BLEND_ZERO:
1852 			// Optimized
1853 			break;
1854 		case BLEND_ONE:
1855 			// Optimized
1856 			break;
1857 		case BLEND_SOURCE:
1858 			blendFactor.x = oC.x;
1859 			blendFactor.y = oC.y;
1860 			blendFactor.z = oC.z;
1861 			break;
1862 		case BLEND_INVSOURCE:
1863 			blendFactor.x = Float4(1.0f) - oC.x;
1864 			blendFactor.y = Float4(1.0f) - oC.y;
1865 			blendFactor.z = Float4(1.0f) - oC.z;
1866 			break;
1867 		case BLEND_DEST:
1868 			blendFactor.x = pixel.x;
1869 			blendFactor.y = pixel.y;
1870 			blendFactor.z = pixel.z;
1871 			break;
1872 		case BLEND_INVDEST:
1873 			blendFactor.x = Float4(1.0f) - pixel.x;
1874 			blendFactor.y = Float4(1.0f) - pixel.y;
1875 			blendFactor.z = Float4(1.0f) - pixel.z;
1876 			break;
1877 		case BLEND_SOURCEALPHA:
1878 			blendFactor.x = oC.w;
1879 			blendFactor.y = oC.w;
1880 			blendFactor.z = oC.w;
1881 			break;
1882 		case BLEND_INVSOURCEALPHA:
1883 			blendFactor.x = Float4(1.0f) - oC.w;
1884 			blendFactor.y = Float4(1.0f) - oC.w;
1885 			blendFactor.z = Float4(1.0f) - oC.w;
1886 			break;
1887 		case BLEND_DESTALPHA:
1888 			blendFactor.x = pixel.w;
1889 			blendFactor.y = pixel.w;
1890 			blendFactor.z = pixel.w;
1891 			break;
1892 		case BLEND_INVDESTALPHA:
1893 			blendFactor.x = Float4(1.0f) - pixel.w;
1894 			blendFactor.y = Float4(1.0f) - pixel.w;
1895 			blendFactor.z = Float4(1.0f) - pixel.w;
1896 			break;
1897 		case BLEND_SRCALPHASAT:
1898 			blendFactor.x = Float4(1.0f) - pixel.w;
1899 			blendFactor.x = Min(blendFactor.x, oC.w);
1900 			blendFactor.y = blendFactor.x;
1901 			blendFactor.z = blendFactor.x;
1902 			break;
1903 		case BLEND_CONSTANT:
1904 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1905 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1906 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1907 			break;
1908 		case BLEND_INVCONSTANT:
1909 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1910 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1911 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1912 			break;
1913 		default:
1914 			ASSERT(false);
1915 		}
1916 	}
1917 
blendFactorAlpha(const Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorAlphaActive)1918 	void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1919 	{
1920 		switch(blendFactorAlphaActive)
1921 		{
1922 		case BLEND_ZERO:
1923 			// Optimized
1924 			break;
1925 		case BLEND_ONE:
1926 			// Optimized
1927 			break;
1928 		case BLEND_SOURCE:
1929 			blendFactor.w = oC.w;
1930 			break;
1931 		case BLEND_INVSOURCE:
1932 			blendFactor.w = Float4(1.0f) - oC.w;
1933 			break;
1934 		case BLEND_DEST:
1935 			blendFactor.w = pixel.w;
1936 			break;
1937 		case BLEND_INVDEST:
1938 			blendFactor.w = Float4(1.0f) - pixel.w;
1939 			break;
1940 		case BLEND_SOURCEALPHA:
1941 			blendFactor.w = oC.w;
1942 			break;
1943 		case BLEND_INVSOURCEALPHA:
1944 			blendFactor.w = Float4(1.0f) - oC.w;
1945 			break;
1946 		case BLEND_DESTALPHA:
1947 			blendFactor.w = pixel.w;
1948 			break;
1949 		case BLEND_INVDESTALPHA:
1950 			blendFactor.w = Float4(1.0f) - pixel.w;
1951 			break;
1952 		case BLEND_SRCALPHASAT:
1953 			blendFactor.w = Float4(1.0f);
1954 			break;
1955 		case BLEND_CONSTANT:
1956 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
1957 			break;
1958 		case BLEND_INVCONSTANT:
1959 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1960 			break;
1961 		default:
1962 			ASSERT(false);
1963 		}
1964 	}
1965 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4f & oC,Int & x)1966 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
1967 	{
1968 		if(!state.alphaBlendActive)
1969 		{
1970 			return;
1971 		}
1972 
1973 		Pointer<Byte> buffer;
1974 		Vector4f pixel;
1975 
1976 		Vector4s color;
1977 		Short4 c01;
1978 		Short4 c23;
1979 
1980 		Float4 one;
1981 		switch(state.targetFormat[index])
1982 		{
1983 		case FORMAT_R32I:
1984 		case FORMAT_G32R32I:
1985 			one = As<Float4>(Int4(0x7FFFFFFF));
1986 			break;
1987 		case FORMAT_R32UI:
1988 		case FORMAT_G32R32UI:
1989 			one = As<Float4>(Int4(0xFFFFFFFF));
1990 			break;
1991 		case FORMAT_R32F:
1992 		case FORMAT_G32R32F:
1993 			one = Float4(1.0f);
1994 			break;
1995 		}
1996 
1997 		switch(state.targetFormat[index])
1998 		{
1999 		case FORMAT_R32I:
2000 		case FORMAT_R32UI:
2001 		case FORMAT_R32F:
2002 			buffer = cBuffer;
2003 			// FIXME: movlps
2004 			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2005 			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2006 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2007 			// FIXME: movhps
2008 			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2009 			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2010 			pixel.y = pixel.z = pixel.w = one;
2011 			break;
2012 		case FORMAT_G32R32I:
2013 		case FORMAT_G32R32UI:
2014 		case FORMAT_G32R32F:
2015 			buffer = cBuffer;
2016 			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2017 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2018 			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2019 			pixel.z = pixel.x;
2020 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2021 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2022 			pixel.y = pixel.z;
2023 			pixel.z = pixel.w = one;
2024 			break;
2025 		case FORMAT_X32B32G32R32F:
2026 		case FORMAT_A32B32G32R32F:
2027 		case FORMAT_A32B32G32R32I:
2028 		case FORMAT_A32B32G32R32UI:
2029 			buffer = cBuffer;
2030 			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2031 			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2032 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2033 			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2034 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2035 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2036 			if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2037 			{
2038 				pixel.w = Float4(1.0f);
2039 			}
2040 			break;
2041 		default:
2042 			ASSERT(false);
2043 		}
2044 
2045 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2046 		{
2047 			sRGBtoLinear(pixel.x);
2048 			sRGBtoLinear(pixel.y);
2049 			sRGBtoLinear(pixel.z);
2050 		}
2051 
2052 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2053 		Vector4f sourceFactor;
2054 		Vector4f destFactor;
2055 
2056 		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2057 		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2058 
2059 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2060 		{
2061 			oC.x *= sourceFactor.x;
2062 			oC.y *= sourceFactor.y;
2063 			oC.z *= sourceFactor.z;
2064 		}
2065 
2066 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2067 		{
2068 			pixel.x *= destFactor.x;
2069 			pixel.y *= destFactor.y;
2070 			pixel.z *= destFactor.z;
2071 		}
2072 
2073 		switch(state.blendOperation)
2074 		{
2075 		case BLENDOP_ADD:
2076 			oC.x += pixel.x;
2077 			oC.y += pixel.y;
2078 			oC.z += pixel.z;
2079 			break;
2080 		case BLENDOP_SUB:
2081 			oC.x -= pixel.x;
2082 			oC.y -= pixel.y;
2083 			oC.z -= pixel.z;
2084 			break;
2085 		case BLENDOP_INVSUB:
2086 			oC.x = pixel.x - oC.x;
2087 			oC.y = pixel.y - oC.y;
2088 			oC.z = pixel.z - oC.z;
2089 			break;
2090 		case BLENDOP_MIN:
2091 			oC.x = Min(oC.x, pixel.x);
2092 			oC.y = Min(oC.y, pixel.y);
2093 			oC.z = Min(oC.z, pixel.z);
2094 			break;
2095 		case BLENDOP_MAX:
2096 			oC.x = Max(oC.x, pixel.x);
2097 			oC.y = Max(oC.y, pixel.y);
2098 			oC.z = Max(oC.z, pixel.z);
2099 			break;
2100 		case BLENDOP_SOURCE:
2101 			// No operation
2102 			break;
2103 		case BLENDOP_DEST:
2104 			oC.x = pixel.x;
2105 			oC.y = pixel.y;
2106 			oC.z = pixel.z;
2107 			break;
2108 		case BLENDOP_NULL:
2109 			oC.x = Float4(0.0f);
2110 			oC.y = Float4(0.0f);
2111 			oC.z = Float4(0.0f);
2112 			break;
2113 		default:
2114 			ASSERT(false);
2115 		}
2116 
2117 		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2118 		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2119 
2120 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2121 		{
2122 			oC.w *= sourceFactor.w;
2123 		}
2124 
2125 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2126 		{
2127 			pixel.w *= destFactor.w;
2128 		}
2129 
2130 		switch(state.blendOperationAlpha)
2131 		{
2132 		case BLENDOP_ADD:
2133 			oC.w += pixel.w;
2134 			break;
2135 		case BLENDOP_SUB:
2136 			oC.w -= pixel.w;
2137 			break;
2138 		case BLENDOP_INVSUB:
2139 			pixel.w -= oC.w;
2140 			oC.w = pixel.w;
2141 			break;
2142 		case BLENDOP_MIN:
2143 			oC.w = Min(oC.w, pixel.w);
2144 			break;
2145 		case BLENDOP_MAX:
2146 			oC.w = Max(oC.w, pixel.w);
2147 			break;
2148 		case BLENDOP_SOURCE:
2149 			// No operation
2150 			break;
2151 		case BLENDOP_DEST:
2152 			oC.w = pixel.w;
2153 			break;
2154 		case BLENDOP_NULL:
2155 			oC.w = Float4(0.0f);
2156 			break;
2157 		default:
2158 			ASSERT(false);
2159 		}
2160 	}
2161 
writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4f & oC,Int & sMask,Int & zMask,Int & cMask)2162 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2163 	{
2164 		switch(state.targetFormat[index])
2165 		{
2166 		case FORMAT_R32F:
2167 		case FORMAT_R32I:
2168 		case FORMAT_R32UI:
2169 			break;
2170 		case FORMAT_G32R32F:
2171 		case FORMAT_G32R32I:
2172 		case FORMAT_G32R32UI:
2173 			oC.z = oC.x;
2174 			oC.x = UnpackLow(oC.x, oC.y);
2175 			oC.z = UnpackHigh(oC.z, oC.y);
2176 			oC.y = oC.z;
2177 			break;
2178 		case FORMAT_X32B32G32R32F:
2179 		case FORMAT_A32B32G32R32F:
2180 		case FORMAT_A32B32G32R32I:
2181 		case FORMAT_A32B32G32R32UI:
2182 			transpose4x4(oC.x, oC.y, oC.z, oC.w);
2183 			break;
2184 		default:
2185 			ASSERT(false);
2186 		}
2187 
2188 		int rgbaWriteMask = state.colorWriteActive(index);
2189 
2190 		Int xMask;   // Combination of all masks
2191 
2192 		if(state.depthTestActive)
2193 		{
2194 			xMask = zMask;
2195 		}
2196 		else
2197 		{
2198 			xMask = cMask;
2199 		}
2200 
2201 		if(state.stencilActive)
2202 		{
2203 			xMask &= sMask;
2204 		}
2205 
2206 		Pointer<Byte> buffer;
2207 		Float4 value;
2208 
2209 		switch(state.targetFormat[index])
2210 		{
2211 		case FORMAT_R32F:
2212 		case FORMAT_R32I:
2213 		case FORMAT_R32UI:
2214 			if(rgbaWriteMask & 0x00000001)
2215 			{
2216 				buffer = cBuffer + 4 * x;
2217 
2218 				// FIXME: movlps
2219 				value.x = *Pointer<Float>(buffer + 0);
2220 				value.y = *Pointer<Float>(buffer + 4);
2221 
2222 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2223 
2224 				// FIXME: movhps
2225 				value.z = *Pointer<Float>(buffer + 0);
2226 				value.w = *Pointer<Float>(buffer + 4);
2227 
2228 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2229 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2230 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2231 
2232 				// FIXME: movhps
2233 				*Pointer<Float>(buffer + 0) = oC.x.z;
2234 				*Pointer<Float>(buffer + 4) = oC.x.w;
2235 
2236 				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2237 
2238 				// FIXME: movlps
2239 				*Pointer<Float>(buffer + 0) = oC.x.x;
2240 				*Pointer<Float>(buffer + 4) = oC.x.y;
2241 			}
2242 			break;
2243 		case FORMAT_G32R32F:
2244 		case FORMAT_G32R32I:
2245 		case FORMAT_G32R32UI:
2246 			buffer = cBuffer + 8 * x;
2247 
2248 			value = *Pointer<Float4>(buffer);
2249 
2250 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2251 			{
2252 				Float4 masked = value;
2253 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2254 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2255 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2256 			}
2257 
2258 			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2259 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2260 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2261 			*Pointer<Float4>(buffer) = oC.x;
2262 
2263 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2264 
2265 			value = *Pointer<Float4>(buffer);
2266 
2267 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2268 			{
2269 				Float4 masked;
2270 
2271 				masked = value;
2272 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2273 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2274 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2275 			}
2276 
2277 			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2278 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2279 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2280 			*Pointer<Float4>(buffer) = oC.y;
2281 			break;
2282 		case FORMAT_X32B32G32R32F:
2283 		case FORMAT_A32B32G32R32F:
2284 		case FORMAT_A32B32G32R32I:
2285 		case FORMAT_A32B32G32R32UI:
2286 			buffer = cBuffer + 16 * x;
2287 
2288 			{
2289 				value = *Pointer<Float4>(buffer, 16);
2290 
2291 				if(rgbaWriteMask != 0x0000000F)
2292 				{
2293 					Float4 masked = value;
2294 					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2295 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2296 					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2297 				}
2298 
2299 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2300 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2301 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2302 				*Pointer<Float4>(buffer, 16) = oC.x;
2303 			}
2304 
2305 			{
2306 				value = *Pointer<Float4>(buffer + 16, 16);
2307 
2308 				if(rgbaWriteMask != 0x0000000F)
2309 				{
2310 					Float4 masked = value;
2311 					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2312 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2313 					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2314 				}
2315 
2316 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2317 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2318 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2319 				*Pointer<Float4>(buffer + 16, 16) = oC.y;
2320 			}
2321 
2322 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2323 
2324 			{
2325 				value = *Pointer<Float4>(buffer, 16);
2326 
2327 				if(rgbaWriteMask != 0x0000000F)
2328 				{
2329 					Float4 masked = value;
2330 					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2331 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2332 					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2333 				}
2334 
2335 				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2336 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2337 				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2338 				*Pointer<Float4>(buffer, 16) = oC.z;
2339 			}
2340 
2341 			{
2342 				value = (state.targetFormat[index] == FORMAT_X32B32G32R32F) ? Float4(1.0f) : *Pointer<Float4>(buffer + 16, 16);
2343 
2344 				if(rgbaWriteMask != 0x0000000F)
2345 				{
2346 					Float4 masked = value;
2347 					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2348 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2349 					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2350 				}
2351 
2352 				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2353 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2354 				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2355 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
2356 			}
2357 			break;
2358 		default:
2359 			ASSERT(false);
2360 		}
2361 	}
2362 
convertFixed16(Float4 & cf,bool saturate)2363 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2364 	{
2365 		return UShort4(cf * Float4(0xFFFF), saturate);
2366 	}
2367 
sRGBtoLinear16_12_16(Vector4s & c)2368 	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2369 	{
2370 		c.x = As<UShort4>(c.x) >> 4;
2371 		c.y = As<UShort4>(c.y) >> 4;
2372 		c.z = As<UShort4>(c.z) >> 4;
2373 
2374 		sRGBtoLinear12_16(c);
2375 	}
2376 
sRGBtoLinear12_16(Vector4s & c)2377 	void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2378 	{
2379 		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2380 
2381 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2382 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2383 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2384 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2385 
2386 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2387 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2388 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2389 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2390 
2391 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2392 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2393 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2394 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2395 	}
2396 
linearToSRGB16_12_16(Vector4s & c)2397 	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2398 	{
2399 		c.x = As<UShort4>(c.x) >> 4;
2400 		c.y = As<UShort4>(c.y) >> 4;
2401 		c.z = As<UShort4>(c.z) >> 4;
2402 
2403 		linearToSRGB12_16(c);
2404 	}
2405 
linearToSRGB12_16(Vector4s & c)2406 	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2407 	{
2408 		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2409 
2410 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2411 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2412 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2413 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2414 
2415 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2416 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2417 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2418 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2419 
2420 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2421 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2422 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2423 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2424 	}
2425 
sRGBtoLinear(const Float4 & x)2426 	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2427 	{
2428 		Float4 linear = x * x;
2429 		linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2430 
2431 		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2432 	}
2433 
colorUsed()2434 	bool PixelRoutine::colorUsed()
2435 	{
2436 		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2437 	}
2438 }
2439