1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelPipeline.hpp"
16 #include "Renderer.hpp"
17 #include "SamplerCore.hpp"
18 
19 namespace sw
20 {
21 	extern bool postBlendSRGB;
22 
setBuiltins(Int & x,Int & y,Float4 (& z)[4],Float4 & w)23 	void PixelPipeline::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
24 	{
25 		if(state.color[0].component & 0x1) diffuse.x = convertFixed12(v[0].x); else diffuse.x = Short4(0x1000);
26 		if(state.color[0].component & 0x2) diffuse.y = convertFixed12(v[0].y); else diffuse.y = Short4(0x1000);
27 		if(state.color[0].component & 0x4) diffuse.z = convertFixed12(v[0].z); else diffuse.z = Short4(0x1000);
28 		if(state.color[0].component & 0x8) diffuse.w = convertFixed12(v[0].w); else diffuse.w = Short4(0x1000);
29 
30 		if(state.color[1].component & 0x1) specular.x = convertFixed12(v[1].x); else specular.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
31 		if(state.color[1].component & 0x2) specular.y = convertFixed12(v[1].y); else specular.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
32 		if(state.color[1].component & 0x4) specular.z = convertFixed12(v[1].z); else specular.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
33 		if(state.color[1].component & 0x8) specular.w = convertFixed12(v[1].w); else specular.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
34 	}
35 
fixedFunction()36 	void PixelPipeline::fixedFunction()
37 	{
38 		current = diffuse;
39 		Vector4s temp(0x0000, 0x0000, 0x0000, 0x0000);
40 
41 		for(int stage = 0; stage < 8; stage++)
42 		{
43 			if(state.textureStage[stage].stageOperation == TextureStage::STAGE_DISABLE)
44 			{
45 				break;
46 			}
47 
48 			Vector4s texture;
49 
50 			if(state.textureStage[stage].usesTexture)
51 			{
52 				sampleTexture(texture, stage, stage);
53 			}
54 
55 			blendTexture(temp, texture, stage);
56 		}
57 
58 		specularPixel(current, specular);
59 	}
60 
applyShader(Int cMask[4])61 	void PixelPipeline::applyShader(Int cMask[4])
62 	{
63 		if(!shader)
64 		{
65 			fixedFunction();
66 			return;
67 		}
68 
69 		int pad = 0;        // Count number of texm3x3pad instructions
70 		Vector4s dPairing;   // Destination for first pairing instruction
71 
72 		for(size_t i = 0; i < shader->getLength(); i++)
73 		{
74 			const Shader::Instruction *instruction = shader->getInstruction(i);
75 			Shader::Opcode opcode = instruction->opcode;
76 
77 			//	#ifndef NDEBUG   // FIXME: Centralize debug output control
78 			//		shader->printInstruction(i, "debug.txt");
79 			//	#endif
80 
81 			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
82 			{
83 				continue;
84 			}
85 
86 			const Dst &dst = instruction->dst;
87 			const Src &src0 = instruction->src[0];
88 			const Src &src1 = instruction->src[1];
89 			const Src &src2 = instruction->src[2];
90 
91 			unsigned short version = shader->getVersion();
92 			bool pairing = i + 1 < shader->getLength() && shader->getInstruction(i + 1)->coissue;   // First instruction of pair
93 			bool coissue = instruction->coissue;                                                              // Second instruction of pair
94 
95 			Vector4s d;
96 			Vector4s s0;
97 			Vector4s s1;
98 			Vector4s s2;
99 
100 			if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
101 			if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
102 			if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
103 
104 			Float4 x = version < 0x0104 ? v[2 + dst.index].x : v[2 + src0.index].x;
105 			Float4 y = version < 0x0104 ? v[2 + dst.index].y : v[2 + src0.index].y;
106 			Float4 z = version < 0x0104 ? v[2 + dst.index].z : v[2 + src0.index].z;
107 			Float4 w = version < 0x0104 ? v[2 + dst.index].w : v[2 + src0.index].w;
108 
109 			switch(opcode)
110 			{
111 			case Shader::OPCODE_PS_1_0: break;
112 			case Shader::OPCODE_PS_1_1: break;
113 			case Shader::OPCODE_PS_1_2: break;
114 			case Shader::OPCODE_PS_1_3: break;
115 			case Shader::OPCODE_PS_1_4: break;
116 
117 			case Shader::OPCODE_DEF:    break;
118 
119 			case Shader::OPCODE_NOP:    break;
120 			case Shader::OPCODE_MOV: MOV(d, s0);         break;
121 			case Shader::OPCODE_ADD: ADD(d, s0, s1);     break;
122 			case Shader::OPCODE_SUB: SUB(d, s0, s1);     break;
123 			case Shader::OPCODE_MAD: MAD(d, s0, s1, s2); break;
124 			case Shader::OPCODE_MUL: MUL(d, s0, s1);     break;
125 			case Shader::OPCODE_DP3: DP3(d, s0, s1);     break;
126 			case Shader::OPCODE_DP4: DP4(d, s0, s1);     break;
127 			case Shader::OPCODE_LRP: LRP(d, s0, s1, s2); break;
128 			case Shader::OPCODE_TEXCOORD:
129 				if(version < 0x0104)
130 				{
131 					TEXCOORD(d, x, y, z, dst.index);
132 			}
133 				else
134 				{
135 					if((src0.swizzle & 0x30) == 0x20)   // .xyz
136 					{
137 						TEXCRD(d, x, y, z, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
138 					}
139 					else   // .xwy
140 					{
141 						TEXCRD(d, x, y, w, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
142 					}
143 				}
144 				break;
145 			case Shader::OPCODE_TEXKILL:
146 				if(version < 0x0104)
147 				{
148 					TEXKILL(cMask, x, y, z);
149 				}
150 				else if(version == 0x0104)
151 				{
152 					if(dst.type == Shader::PARAMETER_TEXTURE)
153 					{
154 						TEXKILL(cMask, x, y, z);
155 					}
156 					else
157 					{
158 						TEXKILL(cMask, rs[dst.index]);
159 					}
160 				}
161 				else ASSERT(false);
162 				break;
163 			case Shader::OPCODE_TEX:
164 				if(version < 0x0104)
165 				{
166 					TEX(d, x, y, z, dst.index, false);
167 				}
168 				else if(version == 0x0104)
169 				{
170 					if(src0.type == Shader::PARAMETER_TEXTURE)
171 					{
172 						if((src0.swizzle & 0x30) == 0x20)   // .xyz
173 						{
174 							TEX(d, x, y, z, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
175 						}
176 						else   // .xyw
177 						{
178 							TEX(d, x, y, w, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
179 						}
180 					}
181 					else
182 					{
183 						TEXLD(d, s0, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
184 					}
185 				}
186 				else ASSERT(false);
187 				break;
188 			case Shader::OPCODE_TEXBEM:       TEXBEM(d, s0, x, y, z, dst.index);                                             break;
189 			case Shader::OPCODE_TEXBEML:      TEXBEML(d, s0, x, y, z, dst.index);                                            break;
190 			case Shader::OPCODE_TEXREG2AR:    TEXREG2AR(d, s0, dst.index);                                                   break;
191 			case Shader::OPCODE_TEXREG2GB:    TEXREG2GB(d, s0, dst.index);                                                   break;
192 			case Shader::OPCODE_TEXM3X2PAD:   TEXM3X2PAD(x, y, z, s0, 0, src0.modifier == Shader::MODIFIER_SIGN);            break;
193 			case Shader::OPCODE_TEXM3X2TEX:   TEXM3X2TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
194 			case Shader::OPCODE_TEXM3X3PAD:   TEXM3X3PAD(x, y, z, s0, pad++ % 2, src0.modifier == Shader::MODIFIER_SIGN);    break;
195 			case Shader::OPCODE_TEXM3X3TEX:   TEXM3X3TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
196 			case Shader::OPCODE_TEXM3X3SPEC:  TEXM3X3SPEC(d, x, y, z, dst.index, s0, s1);                                    break;
197 			case Shader::OPCODE_TEXM3X3VSPEC: TEXM3X3VSPEC(d, x, y, z, dst.index, s0);                                       break;
198 			case Shader::OPCODE_CND:          CND(d, s0, s1, s2);                                                            break;
199 			case Shader::OPCODE_TEXREG2RGB:   TEXREG2RGB(d, s0, dst.index);                                                  break;
200 			case Shader::OPCODE_TEXDP3TEX:    TEXDP3TEX(d, x, y, z, dst.index, s0);                                          break;
201 			case Shader::OPCODE_TEXM3X2DEPTH: TEXM3X2DEPTH(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);          break;
202 			case Shader::OPCODE_TEXDP3:       TEXDP3(d, x, y, z, s0);                                                        break;
203 			case Shader::OPCODE_TEXM3X3:      TEXM3X3(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);               break;
204 			case Shader::OPCODE_TEXDEPTH:     TEXDEPTH();                                                                    break;
205 			case Shader::OPCODE_CMP0:         CMP(d, s0, s1, s2);                                                            break;
206 			case Shader::OPCODE_BEM:          BEM(d, s0, s1, dst.index);                                                     break;
207 			case Shader::OPCODE_PHASE:                                                                                       break;
208 			case Shader::OPCODE_END:                                                                                         break;
209 			default:
210 				ASSERT(false);
211 			}
212 
213 			if(dst.type != Shader::PARAMETER_VOID && opcode != Shader::OPCODE_TEXKILL)
214 			{
215 				if(dst.shift > 0)
216 				{
217 					if(dst.mask & 0x1) { d.x = AddSat(d.x, d.x); if(dst.shift > 1) d.x = AddSat(d.x, d.x); if(dst.shift > 2) d.x = AddSat(d.x, d.x); }
218 					if(dst.mask & 0x2) { d.y = AddSat(d.y, d.y); if(dst.shift > 1) d.y = AddSat(d.y, d.y); if(dst.shift > 2) d.y = AddSat(d.y, d.y); }
219 					if(dst.mask & 0x4) { d.z = AddSat(d.z, d.z); if(dst.shift > 1) d.z = AddSat(d.z, d.z); if(dst.shift > 2) d.z = AddSat(d.z, d.z); }
220 					if(dst.mask & 0x8) { d.w = AddSat(d.w, d.w); if(dst.shift > 1) d.w = AddSat(d.w, d.w); if(dst.shift > 2) d.w = AddSat(d.w, d.w); }
221 				}
222 				else if(dst.shift < 0)
223 				{
224 					if(dst.mask & 0x1) d.x = d.x >> -dst.shift;
225 					if(dst.mask & 0x2) d.y = d.y >> -dst.shift;
226 					if(dst.mask & 0x4) d.z = d.z >> -dst.shift;
227 					if(dst.mask & 0x8) d.w = d.w >> -dst.shift;
228 				}
229 
230 				if(dst.saturate)
231 				{
232 					if(dst.mask & 0x1) { d.x = Min(d.x, Short4(0x1000)); d.x = Max(d.x, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
233 					if(dst.mask & 0x2) { d.y = Min(d.y, Short4(0x1000)); d.y = Max(d.y, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
234 					if(dst.mask & 0x4) { d.z = Min(d.z, Short4(0x1000)); d.z = Max(d.z, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
235 					if(dst.mask & 0x8) { d.w = Min(d.w, Short4(0x1000)); d.w = Max(d.w, Short4(0x0000, 0x0000, 0x0000, 0x0000)); }
236 				}
237 
238 				if(pairing)
239 				{
240 					if(dst.mask & 0x1) dPairing.x = d.x;
241 					if(dst.mask & 0x2) dPairing.y = d.y;
242 					if(dst.mask & 0x4) dPairing.z = d.z;
243 					if(dst.mask & 0x8) dPairing.w = d.w;
244 				}
245 
246 				if(coissue)
247 				{
248 					const Dst &dst = shader->getInstruction(i - 1)->dst;
249 
250 					writeDestination(dPairing, dst);
251 				}
252 
253 				if(!pairing)
254 				{
255 					writeDestination(d, dst);
256 				}
257 			}
258 		}
259 	}
260 
alphaTest(Int cMask[4])261 	Bool PixelPipeline::alphaTest(Int cMask[4])
262 	{
263 		current.x = Min(current.x, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.x = Max(current.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
264 		current.y = Min(current.y, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.y = Max(current.y, Short4(0x0000, 0x0000, 0x0000, 0x0000));
265 		current.z = Min(current.z, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.z = Max(current.z, Short4(0x0000, 0x0000, 0x0000, 0x0000));
266 		current.w = Min(current.w, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.w = Max(current.w, Short4(0x0000, 0x0000, 0x0000, 0x0000));
267 
268 		if(!state.alphaTestActive())
269 		{
270 			return true;
271 		}
272 
273 		Int aMask;
274 
275 		if(state.transparencyAntialiasing == TRANSPARENCY_NONE)
276 		{
277 			PixelRoutine::alphaTest(aMask, current.w);
278 
279 			for(unsigned int q = 0; q < state.multiSample; q++)
280 			{
281 				cMask[q] &= aMask;
282 			}
283 		}
284 		else if(state.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
285 		{
286 			Float4 alpha = Float4(current.w) * Float4(1.0f / 0x1000);
287 
288 			alphaToCoverage(cMask, alpha);
289 		}
290 		else ASSERT(false);
291 
292 		Int pass = cMask[0];
293 
294 		for(unsigned int q = 1; q < state.multiSample; q++)
295 		{
296 			pass = pass | cMask[q];
297 		}
298 
299 		return pass != 0x0;
300 	}
301 
rasterOperation(Float4 & fog,Pointer<Byte> cBuffer[4],Int & x,Int sMask[4],Int zMask[4],Int cMask[4])302 	void PixelPipeline::rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
303 	{
304 		if(!state.colorWriteActive(0))
305 		{
306 			return;
307 		}
308 
309 		Vector4f oC;
310 
311 		switch(state.targetFormat[0])
312 		{
313 		case FORMAT_R5G6B5:
314 		case FORMAT_X8R8G8B8:
315 		case FORMAT_X8B8G8R8:
316 		case FORMAT_A8R8G8B8:
317 		case FORMAT_A8B8G8R8:
318 		case FORMAT_A8:
319 		case FORMAT_G16R16:
320 		case FORMAT_A16B16G16R16:
321 			if(!postBlendSRGB && state.writeSRGB)
322 			{
323 				linearToSRGB12_16(current);
324 			}
325 			else
326 			{
327 				current.x <<= 4;
328 				current.y <<= 4;
329 				current.z <<= 4;
330 				current.w <<= 4;
331 			}
332 
333 			if(state.targetFormat[0] == FORMAT_R5G6B5)
334 			{
335 				current.x &= Short4(0xF800u);
336 				current.y &= Short4(0xFC00u);
337 				current.z &= Short4(0xF800u);
338 			}
339 
340 			fogBlend(current, fog);
341 
342 			for(unsigned int q = 0; q < state.multiSample; q++)
343 			{
344 				Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
345 				Vector4s color = current;
346 
347 				if(state.multiSampleMask & (1 << q))
348 				{
349 					alphaBlend(0, buffer, color, x);
350 					logicOperation(0, buffer, color, x);
351 					writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
352 				}
353 			}
354 			break;
355 		case FORMAT_R32F:
356 		case FORMAT_G32R32F:
357 		case FORMAT_X32B32G32R32F:
358 		case FORMAT_A32B32G32R32F:
359 			convertSigned12(oC, current);
360 			PixelRoutine::fogBlend(oC, fog);
361 
362 			for(unsigned int q = 0; q < state.multiSample; q++)
363 			{
364 				Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
365 				Vector4f color = oC;
366 
367 				if(state.multiSampleMask & (1 << q))
368 				{
369 					alphaBlend(0, buffer, color, x);
370 					writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
371 				}
372 			}
373 			break;
374 		default:
375 			ASSERT(false);
376 		}
377 	}
378 
blendTexture(Vector4s & temp,Vector4s & texture,int stage)379 	void PixelPipeline::blendTexture(Vector4s &temp, Vector4s &texture, int stage)
380 	{
381 		Vector4s *arg1;
382 		Vector4s *arg2;
383 		Vector4s *arg3;
384 		Vector4s res;
385 
386 		Vector4s constant;
387 		Vector4s tfactor;
388 
389 		const TextureStage::State &textureStage = state.textureStage[stage];
390 
391 		if(textureStage.firstArgument == TextureStage::SOURCE_CONSTANT ||
392 		   textureStage.firstArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
393 		   textureStage.secondArgument == TextureStage::SOURCE_CONSTANT ||
394 		   textureStage.secondArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
395 		   textureStage.thirdArgument == TextureStage::SOURCE_CONSTANT ||
396 		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_CONSTANT)
397 		{
398 			constant.x = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[0]));
399 			constant.y = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[1]));
400 			constant.z = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[2]));
401 			constant.w = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[3]));
402 		}
403 
404 		if(textureStage.firstArgument == TextureStage::SOURCE_TFACTOR ||
405 		   textureStage.firstArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
406 		   textureStage.secondArgument == TextureStage::SOURCE_TFACTOR ||
407 		   textureStage.secondArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
408 		   textureStage.thirdArgument == TextureStage::SOURCE_TFACTOR ||
409 		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_TFACTOR)
410 		{
411 			tfactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[0]));
412 			tfactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[1]));
413 			tfactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[2]));
414 			tfactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]));
415 		}
416 
417 		// Premodulate
418 		if(stage > 0 && textureStage.usesTexture)
419 		{
420 			if(state.textureStage[stage - 1].stageOperation == TextureStage::STAGE_PREMODULATE)
421 			{
422 				current.x = MulHigh(current.x, texture.x) << 4;
423 				current.y = MulHigh(current.y, texture.y) << 4;
424 				current.z = MulHigh(current.z, texture.z) << 4;
425 			}
426 
427 			if(state.textureStage[stage - 1].stageOperationAlpha == TextureStage::STAGE_PREMODULATE)
428 			{
429 				current.w = MulHigh(current.w, texture.w) << 4;
430 			}
431 		}
432 
433 		if(luminance)
434 		{
435 			texture.x = MulHigh(texture.x, L) << 4;
436 			texture.y = MulHigh(texture.y, L) << 4;
437 			texture.z = MulHigh(texture.z, L) << 4;
438 
439 			luminance = false;
440 		}
441 
442 		switch(textureStage.firstArgument)
443 		{
444 		case TextureStage::SOURCE_TEXTURE:	arg1 = &texture;    break;
445 		case TextureStage::SOURCE_CONSTANT:	arg1 = &constant;   break;
446 		case TextureStage::SOURCE_CURRENT:	arg1 = &current;  break;
447 		case TextureStage::SOURCE_DIFFUSE:	arg1 = &diffuse;  break;
448 		case TextureStage::SOURCE_SPECULAR:	arg1 = &specular; break;
449 		case TextureStage::SOURCE_TEMP:		arg1 = &temp;       break;
450 		case TextureStage::SOURCE_TFACTOR:	arg1 = &tfactor;    break;
451 		default:
452 			ASSERT(false);
453 		}
454 
455 		switch(textureStage.secondArgument)
456 		{
457 		case TextureStage::SOURCE_TEXTURE:	arg2 = &texture;    break;
458 		case TextureStage::SOURCE_CONSTANT:	arg2 = &constant;   break;
459 		case TextureStage::SOURCE_CURRENT:	arg2 = &current;  break;
460 		case TextureStage::SOURCE_DIFFUSE:	arg2 = &diffuse;  break;
461 		case TextureStage::SOURCE_SPECULAR:	arg2 = &specular; break;
462 		case TextureStage::SOURCE_TEMP:		arg2 = &temp;       break;
463 		case TextureStage::SOURCE_TFACTOR:	arg2 = &tfactor;    break;
464 		default:
465 			ASSERT(false);
466 		}
467 
468 		switch(textureStage.thirdArgument)
469 		{
470 		case TextureStage::SOURCE_TEXTURE:	arg3 = &texture;    break;
471 		case TextureStage::SOURCE_CONSTANT:	arg3 = &constant;   break;
472 		case TextureStage::SOURCE_CURRENT:	arg3 = &current;  break;
473 		case TextureStage::SOURCE_DIFFUSE:	arg3 = &diffuse;  break;
474 		case TextureStage::SOURCE_SPECULAR:	arg3 = &specular; break;
475 		case TextureStage::SOURCE_TEMP:		arg3 = &temp;       break;
476 		case TextureStage::SOURCE_TFACTOR:	arg3 = &tfactor;    break;
477 		default:
478 			ASSERT(false);
479 		}
480 
481 		Vector4s mod1;
482 		Vector4s mod2;
483 		Vector4s mod3;
484 
485 		switch(textureStage.firstModifier)
486 		{
487 		case TextureStage::MODIFIER_COLOR:
488 			break;
489 		case TextureStage::MODIFIER_INVCOLOR:
490 			mod1.x = SubSat(Short4(0x1000), arg1->x);
491 			mod1.y = SubSat(Short4(0x1000), arg1->y);
492 			mod1.z = SubSat(Short4(0x1000), arg1->z);
493 			mod1.w = SubSat(Short4(0x1000), arg1->w);
494 
495 			arg1 = &mod1;
496 			break;
497 		case TextureStage::MODIFIER_ALPHA:
498 			mod1.x = arg1->w;
499 			mod1.y = arg1->w;
500 			mod1.z = arg1->w;
501 			mod1.w = arg1->w;
502 
503 			arg1 = &mod1;
504 			break;
505 		case TextureStage::MODIFIER_INVALPHA:
506 			mod1.x = SubSat(Short4(0x1000), arg1->w);
507 			mod1.y = SubSat(Short4(0x1000), arg1->w);
508 			mod1.z = SubSat(Short4(0x1000), arg1->w);
509 			mod1.w = SubSat(Short4(0x1000), arg1->w);
510 
511 			arg1 = &mod1;
512 			break;
513 		default:
514 			ASSERT(false);
515 		}
516 
517 		switch(textureStage.secondModifier)
518 		{
519 		case TextureStage::MODIFIER_COLOR:
520 			break;
521 		case TextureStage::MODIFIER_INVCOLOR:
522 			mod2.x = SubSat(Short4(0x1000), arg2->x);
523 			mod2.y = SubSat(Short4(0x1000), arg2->y);
524 			mod2.z = SubSat(Short4(0x1000), arg2->z);
525 			mod2.w = SubSat(Short4(0x1000), arg2->w);
526 
527 			arg2 = &mod2;
528 			break;
529 		case TextureStage::MODIFIER_ALPHA:
530 			mod2.x = arg2->w;
531 			mod2.y = arg2->w;
532 			mod2.z = arg2->w;
533 			mod2.w = arg2->w;
534 
535 			arg2 = &mod2;
536 			break;
537 		case TextureStage::MODIFIER_INVALPHA:
538 			mod2.x = SubSat(Short4(0x1000), arg2->w);
539 			mod2.y = SubSat(Short4(0x1000), arg2->w);
540 			mod2.z = SubSat(Short4(0x1000), arg2->w);
541 			mod2.w = SubSat(Short4(0x1000), arg2->w);
542 
543 			arg2 = &mod2;
544 			break;
545 		default:
546 			ASSERT(false);
547 		}
548 
549 		switch(textureStage.thirdModifier)
550 		{
551 		case TextureStage::MODIFIER_COLOR:
552 			break;
553 		case TextureStage::MODIFIER_INVCOLOR:
554 			mod3.x = SubSat(Short4(0x1000), arg3->x);
555 			mod3.y = SubSat(Short4(0x1000), arg3->y);
556 			mod3.z = SubSat(Short4(0x1000), arg3->z);
557 			mod3.w = SubSat(Short4(0x1000), arg3->w);
558 
559 			arg3 = &mod3;
560 			break;
561 		case TextureStage::MODIFIER_ALPHA:
562 			mod3.x = arg3->w;
563 			mod3.y = arg3->w;
564 			mod3.z = arg3->w;
565 			mod3.w = arg3->w;
566 
567 			arg3 = &mod3;
568 			break;
569 		case TextureStage::MODIFIER_INVALPHA:
570 			mod3.x = SubSat(Short4(0x1000), arg3->w);
571 			mod3.y = SubSat(Short4(0x1000), arg3->w);
572 			mod3.z = SubSat(Short4(0x1000), arg3->w);
573 			mod3.w = SubSat(Short4(0x1000), arg3->w);
574 
575 			arg3 = &mod3;
576 			break;
577 		default:
578 			ASSERT(false);
579 		}
580 
581 		switch(textureStage.stageOperation)
582 		{
583 		case TextureStage::STAGE_DISABLE:
584 			break;
585 		case TextureStage::STAGE_SELECTARG1: // Arg1
586 			res.x = arg1->x;
587 			res.y = arg1->y;
588 			res.z = arg1->z;
589 			break;
590 		case TextureStage::STAGE_SELECTARG2: // Arg2
591 			res.x = arg2->x;
592 			res.y = arg2->y;
593 			res.z = arg2->z;
594 			break;
595 		case TextureStage::STAGE_SELECTARG3: // Arg3
596 			res.x = arg3->x;
597 			res.y = arg3->y;
598 			res.z = arg3->z;
599 			break;
600 		case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
601 			res.x = MulHigh(arg1->x, arg2->x) << 4;
602 			res.y = MulHigh(arg1->y, arg2->y) << 4;
603 			res.z = MulHigh(arg1->z, arg2->z) << 4;
604 			break;
605 		case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
606 			res.x = MulHigh(arg1->x, arg2->x) << 5;
607 			res.y = MulHigh(arg1->y, arg2->y) << 5;
608 			res.z = MulHigh(arg1->z, arg2->z) << 5;
609 			break;
610 		case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
611 			res.x = MulHigh(arg1->x, arg2->x) << 6;
612 			res.y = MulHigh(arg1->y, arg2->y) << 6;
613 			res.z = MulHigh(arg1->z, arg2->z) << 6;
614 			break;
615 		case TextureStage::STAGE_ADD: // Arg1 + Arg2
616 			res.x = AddSat(arg1->x, arg2->x);
617 			res.y = AddSat(arg1->y, arg2->y);
618 			res.z = AddSat(arg1->z, arg2->z);
619 			break;
620 		case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
621 			res.x = AddSat(arg1->x, arg2->x);
622 			res.y = AddSat(arg1->y, arg2->y);
623 			res.z = AddSat(arg1->z, arg2->z);
624 
625 			res.x = SubSat(res.x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
626 			res.y = SubSat(res.y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
627 			res.z = SubSat(res.z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
628 			break;
629 		case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
630 			res.x = AddSat(arg1->x, arg2->x);
631 			res.y = AddSat(arg1->y, arg2->y);
632 			res.z = AddSat(arg1->z, arg2->z);
633 
634 			res.x = SubSat(res.x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
635 			res.y = SubSat(res.y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
636 			res.z = SubSat(res.z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
637 
638 			res.x = AddSat(res.x, res.x);
639 			res.y = AddSat(res.y, res.y);
640 			res.z = AddSat(res.z, res.z);
641 			break;
642 		case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
643 			res.x = SubSat(arg1->x, arg2->x);
644 			res.y = SubSat(arg1->y, arg2->y);
645 			res.z = SubSat(arg1->z, arg2->z);
646 			break;
647 		case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
648 			{
649 				Short4 tmp;
650 
651 				tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(arg1->x, arg2->x); res.x = SubSat(res.x, tmp);
652 				tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(arg1->y, arg2->y); res.y = SubSat(res.y, tmp);
653 				tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(arg1->z, arg2->z); res.z = SubSat(res.z, tmp);
654 			}
655 			break;
656 		case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
657 			res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg3->x);
658 			res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg3->y);
659 			res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg3->z);
660 			break;
661 		case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
662 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, arg3->x) << 4; res.x = AddSat(res.x, arg2->x);
663 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, arg3->y) << 4; res.y = AddSat(res.y, arg2->y);
664 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, arg3->z) << 4; res.z = AddSat(res.z, arg2->z);
665 			break;
666 		case TextureStage::STAGE_DOT3: // 2 * (Arg1.x - 0.5) * 2 * (Arg2.x - 0.5) + 2 * (Arg1.y - 0.5) * 2 * (Arg2.y - 0.5) + 2 * (Arg1.z - 0.5) * 2 * (Arg2.z - 0.5)
667 			{
668 				Short4 tmp;
669 
670 				res.x = SubSat(arg1->x, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->x, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.x = MulHigh(res.x, tmp);
671 				res.y = SubSat(arg1->y, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->y, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.y = MulHigh(res.y, tmp);
672 				res.z = SubSat(arg1->z, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->z, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.z = MulHigh(res.z, tmp);
673 
674 				res.x = res.x << 6;
675 				res.y = res.y << 6;
676 				res.z = res.z << 6;
677 
678 				res.x = AddSat(res.x, res.y);
679 				res.x = AddSat(res.x, res.z);
680 
681 				// Clamp to [0, 1]
682 				res.x = Max(res.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
683 				res.x = Min(res.x, Short4(0x1000));
684 
685 				res.y = res.x;
686 				res.z = res.x;
687 				res.w = res.x;
688 			}
689 			break;
690 		case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
691 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, current.w) << 4; res.x = AddSat(res.x, arg2->x);
692 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, current.w) << 4; res.y = AddSat(res.y, arg2->y);
693 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, current.w) << 4; res.z = AddSat(res.z, arg2->z);
694 			break;
695 		case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Alpha * (Arg1 - Arg2) + Arg2
696 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, diffuse.w) << 4; res.x = AddSat(res.x, arg2->x);
697 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, diffuse.w) << 4; res.y = AddSat(res.y, arg2->y);
698 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, diffuse.w) << 4; res.z = AddSat(res.z, arg2->z);
699 			break;
700 		case TextureStage::STAGE_BLENDFACTORALPHA: // Alpha * (Arg1 - Arg2) + Arg2
701 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.x = AddSat(res.x, arg2->x);
702 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.y = AddSat(res.y, arg2->y);
703 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.z = AddSat(res.z, arg2->z);
704 			break;
705 		case TextureStage::STAGE_BLENDTEXTUREALPHA: // Alpha * (Arg1 - Arg2) + Arg2
706 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, texture.w) << 4; res.x = AddSat(res.x, arg2->x);
707 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, texture.w) << 4; res.y = AddSat(res.y, arg2->y);
708 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, texture.w) << 4; res.z = AddSat(res.z, arg2->z);
709 			break;
710 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
711 			res.x = SubSat(Short4(0x1000), texture.w); res.x = MulHigh(res.x, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
712 			res.y = SubSat(Short4(0x1000), texture.w); res.y = MulHigh(res.y, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
713 			res.z = SubSat(Short4(0x1000), texture.w); res.z = MulHigh(res.z, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
714 			break;
715 		case TextureStage::STAGE_PREMODULATE:
716 			res.x = arg1->x;
717 			res.y = arg1->y;
718 			res.z = arg1->z;
719 			break;
720 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR: // Arg1 + Arg1.w * Arg2
721 			res.x = MulHigh(arg1->w, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
722 			res.y = MulHigh(arg1->w, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
723 			res.z = MulHigh(arg1->w, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
724 			break;
725 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA: // Arg1 * Arg2 + Arg1.w
726 			res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg1->w);
727 			res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg1->w);
728 			res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg1->w);
729 			break;
730 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR: // (1 - Arg1.w) * Arg2 + Arg1
731 			{
732 				Short4 tmp;
733 
734 				res.x = AddSat(arg1->x, arg2->x); tmp = MulHigh(arg1->w, arg2->x) << 4; res.x = SubSat(res.x, tmp);
735 				res.y = AddSat(arg1->y, arg2->y); tmp = MulHigh(arg1->w, arg2->y) << 4; res.y = SubSat(res.y, tmp);
736 				res.z = AddSat(arg1->z, arg2->z); tmp = MulHigh(arg1->w, arg2->z) << 4; res.z = SubSat(res.z, tmp);
737 			}
738 			break;
739 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA: // (1 - Arg1) * Arg2 + Arg1.w
740 			{
741 				Short4 tmp;
742 
743 				res.x = AddSat(arg1->w, arg2->x); tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = SubSat(res.x, tmp);
744 				res.y = AddSat(arg1->w, arg2->y); tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = SubSat(res.y, tmp);
745 				res.z = AddSat(arg1->w, arg2->z); tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = SubSat(res.z, tmp);
746 			}
747 			break;
748 		case TextureStage::STAGE_BUMPENVMAP:
749 			{
750 				du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
751 				dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
752 
753 				Float4 du2;
754 				Float4 dv2;
755 
756 				du2 = du;
757 				dv2 = dv;
758 				du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
759 				dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
760 				du += dv2;
761 				dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
762 				du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
763 				dv += du2;
764 
765 				perturbate = true;
766 
767 				res.x = current.x;
768 				res.y = current.y;
769 				res.z = current.z;
770 				res.w = current.w;
771 			}
772 			break;
773 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
774 			{
775 				du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
776 				dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
777 
778 				Float4 du2;
779 				Float4 dv2;
780 
781 				du2 = du;
782 				dv2 = dv;
783 
784 				du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
785 				dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
786 				du += dv2;
787 				dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
788 				du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
789 				dv += du2;
790 
791 				perturbate = true;
792 
793 				L = texture.z;
794 				L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
795 				L = L << 4;
796 				L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
797 				L = Max(L, Short4(0x0000, 0x0000, 0x0000, 0x0000));
798 				L = Min(L, Short4(0x1000));
799 
800 				luminance = true;
801 
802 				res.x = current.x;
803 				res.y = current.y;
804 				res.z = current.z;
805 				res.w = current.w;
806 			}
807 			break;
808 		default:
809 			ASSERT(false);
810 		}
811 
812 		if(textureStage.stageOperation != TextureStage::STAGE_DOT3)
813 		{
814 			switch(textureStage.firstArgumentAlpha)
815 			{
816 			case TextureStage::SOURCE_TEXTURE:	arg1 = &texture;		break;
817 			case TextureStage::SOURCE_CONSTANT:	arg1 = &constant;		break;
818 			case TextureStage::SOURCE_CURRENT:	arg1 = &current;		break;
819 			case TextureStage::SOURCE_DIFFUSE:	arg1 = &diffuse;		break;
820 			case TextureStage::SOURCE_SPECULAR:	arg1 = &specular;		break;
821 			case TextureStage::SOURCE_TEMP:		arg1 = &temp;			break;
822 			case TextureStage::SOURCE_TFACTOR:	arg1 = &tfactor;		break;
823 			default:
824 				ASSERT(false);
825 			}
826 
827 			switch(textureStage.secondArgumentAlpha)
828 			{
829 			case TextureStage::SOURCE_TEXTURE:	arg2 = &texture;		break;
830 			case TextureStage::SOURCE_CONSTANT:	arg2 = &constant;		break;
831 			case TextureStage::SOURCE_CURRENT:	arg2 = &current;		break;
832 			case TextureStage::SOURCE_DIFFUSE:	arg2 = &diffuse;		break;
833 			case TextureStage::SOURCE_SPECULAR:	arg2 = &specular;		break;
834 			case TextureStage::SOURCE_TEMP:		arg2 = &temp;			break;
835 			case TextureStage::SOURCE_TFACTOR:	arg2 = &tfactor;		break;
836 			default:
837 				ASSERT(false);
838 			}
839 
840 			switch(textureStage.thirdArgumentAlpha)
841 			{
842 			case TextureStage::SOURCE_TEXTURE:	arg3 = &texture;		break;
843 			case TextureStage::SOURCE_CONSTANT:	arg3 = &constant;		break;
844 			case TextureStage::SOURCE_CURRENT:	arg3 = &current;		break;
845 			case TextureStage::SOURCE_DIFFUSE:	arg3 = &diffuse;		break;
846 			case TextureStage::SOURCE_SPECULAR:	arg3 = &specular;		break;
847 			case TextureStage::SOURCE_TEMP:		arg3 = &temp;			break;
848 			case TextureStage::SOURCE_TFACTOR:	arg3 = &tfactor;		break;
849 			default:
850 				ASSERT(false);
851 			}
852 
853 			switch(textureStage.firstModifierAlpha)   // FIXME: Check if actually used
854 			{
855 			case TextureStage::MODIFIER_COLOR:
856 				break;
857 			case TextureStage::MODIFIER_INVCOLOR:
858 				mod1.w = SubSat(Short4(0x1000), arg1->w);
859 
860 				arg1 = &mod1;
861 				break;
862 			case TextureStage::MODIFIER_ALPHA:
863 				// Redudant
864 				break;
865 			case TextureStage::MODIFIER_INVALPHA:
866 				mod1.w = SubSat(Short4(0x1000), arg1->w);
867 
868 				arg1 = &mod1;
869 				break;
870 			default:
871 				ASSERT(false);
872 			}
873 
874 			switch(textureStage.secondModifierAlpha)   // FIXME: Check if actually used
875 			{
876 			case TextureStage::MODIFIER_COLOR:
877 				break;
878 			case TextureStage::MODIFIER_INVCOLOR:
879 				mod2.w = SubSat(Short4(0x1000), arg2->w);
880 
881 				arg2 = &mod2;
882 				break;
883 			case TextureStage::MODIFIER_ALPHA:
884 				// Redudant
885 				break;
886 			case TextureStage::MODIFIER_INVALPHA:
887 				mod2.w = SubSat(Short4(0x1000), arg2->w);
888 
889 				arg2 = &mod2;
890 				break;
891 			default:
892 				ASSERT(false);
893 			}
894 
895 			switch(textureStage.thirdModifierAlpha)   // FIXME: Check if actually used
896 			{
897 			case TextureStage::MODIFIER_COLOR:
898 				break;
899 			case TextureStage::MODIFIER_INVCOLOR:
900 				mod3.w = SubSat(Short4(0x1000), arg3->w);
901 
902 				arg3 = &mod3;
903 				break;
904 			case TextureStage::MODIFIER_ALPHA:
905 				// Redudant
906 				break;
907 			case TextureStage::MODIFIER_INVALPHA:
908 				mod3.w = SubSat(Short4(0x1000), arg3->w);
909 
910 				arg3 = &mod3;
911 				break;
912 			default:
913 				ASSERT(false);
914 			}
915 
916 			switch(textureStage.stageOperationAlpha)
917 			{
918 			case TextureStage::STAGE_DISABLE:
919 				break;
920 			case TextureStage::STAGE_SELECTARG1: // Arg1
921 				res.w = arg1->w;
922 				break;
923 			case TextureStage::STAGE_SELECTARG2: // Arg2
924 				res.w = arg2->w;
925 				break;
926 			case TextureStage::STAGE_SELECTARG3: // Arg3
927 				res.w = arg3->w;
928 				break;
929 			case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
930 				res.w = MulHigh(arg1->w, arg2->w) << 4;
931 				break;
932 			case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
933 				res.w = MulHigh(arg1->w, arg2->w) << 5;
934 				break;
935 			case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
936 				res.w = MulHigh(arg1->w, arg2->w) << 6;
937 				break;
938 			case TextureStage::STAGE_ADD: // Arg1 + Arg2
939 				res.w = AddSat(arg1->w, arg2->w);
940 				break;
941 			case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
942 				res.w = AddSat(arg1->w, arg2->w);
943 				res.w = SubSat(res.w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
944 				break;
945 			case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
946 				res.w = AddSat(arg1->w, arg2->w);
947 				res.w = SubSat(res.w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
948 				res.w = AddSat(res.w, res.w);
949 				break;
950 			case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
951 				res.w = SubSat(arg1->w, arg2->w);
952 				break;
953 			case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
954 				{
955 					Short4 tmp;
956 
957 					tmp = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(arg1->w, arg2->w); res.w = SubSat(res.w, tmp);
958 				}
959 				break;
960 			case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
961 				res.w = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(res.w, arg3->w);
962 				break;
963 			case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
964 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, arg3->w) << 4; res.w = AddSat(res.w, arg2->w);
965 				break;
966 			case TextureStage::STAGE_DOT3:
967 				break;   // Already computed in color channel
968 			case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
969 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, current.w) << 4; res.w = AddSat(res.w, arg2->w);
970 				break;
971 			case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
972 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, diffuse.w) << 4; res.w = AddSat(res.w, arg2->w);
973 				break;
974 			case TextureStage::STAGE_BLENDFACTORALPHA:
975 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.w = AddSat(res.w, arg2->w);
976 				break;
977 			case TextureStage::STAGE_BLENDTEXTUREALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
978 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, texture.w) << 4; res.w = AddSat(res.w, arg2->w);
979 				break;
980 			case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
981 				res.w = SubSat(Short4(0x1000), texture.w); res.w = MulHigh(res.w, arg2->w) << 4; res.w = AddSat(res.w, arg1->w);
982 				break;
983 			case TextureStage::STAGE_PREMODULATE:
984 				res.w = arg1->w;
985 				break;
986 			case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
987 			case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
988 			case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
989 			case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
990 			case TextureStage::STAGE_BUMPENVMAP:
991 			case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
992 				break;   // Invalid alpha operations
993 			default:
994 				ASSERT(false);
995 			}
996 		}
997 
998 		// Clamp result to [0, 1]
999 
1000 		switch(textureStage.stageOperation)
1001 		{
1002 		case TextureStage::STAGE_DISABLE:
1003 		case TextureStage::STAGE_SELECTARG1:
1004 		case TextureStage::STAGE_SELECTARG2:
1005 		case TextureStage::STAGE_SELECTARG3:
1006 		case TextureStage::STAGE_MODULATE:
1007 		case TextureStage::STAGE_MODULATE2X:
1008 		case TextureStage::STAGE_MODULATE4X:
1009 		case TextureStage::STAGE_ADD:
1010 		case TextureStage::STAGE_MULTIPLYADD:
1011 		case TextureStage::STAGE_LERP:
1012 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1013 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1014 		case TextureStage::STAGE_BLENDFACTORALPHA:
1015 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1016 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1017 		case TextureStage::STAGE_DOT3:   // Already clamped
1018 		case TextureStage::STAGE_PREMODULATE:
1019 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1020 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1021 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1022 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1023 		case TextureStage::STAGE_BUMPENVMAP:
1024 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1025 			if(state.textureStage[stage].cantUnderflow)
1026 			{
1027 				break;   // Can't go below zero
1028 			}
1029 		case TextureStage::STAGE_ADDSIGNED:
1030 		case TextureStage::STAGE_ADDSIGNED2X:
1031 		case TextureStage::STAGE_SUBTRACT:
1032 		case TextureStage::STAGE_ADDSMOOTH:
1033 			res.x = Max(res.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1034 			res.y = Max(res.y, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1035 			res.z = Max(res.z, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1036 			break;
1037 		default:
1038 			ASSERT(false);
1039 		}
1040 
1041 		switch(textureStage.stageOperationAlpha)
1042 		{
1043 		case TextureStage::STAGE_DISABLE:
1044 		case TextureStage::STAGE_SELECTARG1:
1045 		case TextureStage::STAGE_SELECTARG2:
1046 		case TextureStage::STAGE_SELECTARG3:
1047 		case TextureStage::STAGE_MODULATE:
1048 		case TextureStage::STAGE_MODULATE2X:
1049 		case TextureStage::STAGE_MODULATE4X:
1050 		case TextureStage::STAGE_ADD:
1051 		case TextureStage::STAGE_MULTIPLYADD:
1052 		case TextureStage::STAGE_LERP:
1053 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1054 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1055 		case TextureStage::STAGE_BLENDFACTORALPHA:
1056 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1057 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1058 		case TextureStage::STAGE_DOT3:   // Already clamped
1059 		case TextureStage::STAGE_PREMODULATE:
1060 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1061 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1062 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1063 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1064 		case TextureStage::STAGE_BUMPENVMAP:
1065 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1066 			if(state.textureStage[stage].cantUnderflow)
1067 			{
1068 				break;   // Can't go below zero
1069 			}
1070 		case TextureStage::STAGE_ADDSIGNED:
1071 		case TextureStage::STAGE_ADDSIGNED2X:
1072 		case TextureStage::STAGE_SUBTRACT:
1073 		case TextureStage::STAGE_ADDSMOOTH:
1074 			res.w = Max(res.w, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1075 			break;
1076 		default:
1077 			ASSERT(false);
1078 		}
1079 
1080 		switch(textureStage.stageOperation)
1081 		{
1082 		case TextureStage::STAGE_DISABLE:
1083 		case TextureStage::STAGE_SELECTARG1:
1084 		case TextureStage::STAGE_SELECTARG2:
1085 		case TextureStage::STAGE_SELECTARG3:
1086 		case TextureStage::STAGE_MODULATE:
1087 		case TextureStage::STAGE_SUBTRACT:
1088 		case TextureStage::STAGE_ADDSMOOTH:
1089 		case TextureStage::STAGE_LERP:
1090 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1091 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1092 		case TextureStage::STAGE_BLENDFACTORALPHA:
1093 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1094 		case TextureStage::STAGE_DOT3:   // Already clamped
1095 		case TextureStage::STAGE_PREMODULATE:
1096 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1097 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1098 		case TextureStage::STAGE_BUMPENVMAP:
1099 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1100 			break;   // Can't go above one
1101 		case TextureStage::STAGE_MODULATE2X:
1102 		case TextureStage::STAGE_MODULATE4X:
1103 		case TextureStage::STAGE_ADD:
1104 		case TextureStage::STAGE_ADDSIGNED:
1105 		case TextureStage::STAGE_ADDSIGNED2X:
1106 		case TextureStage::STAGE_MULTIPLYADD:
1107 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1108 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1109 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1110 			res.x = Min(res.x, Short4(0x1000));
1111 			res.y = Min(res.y, Short4(0x1000));
1112 			res.z = Min(res.z, Short4(0x1000));
1113 			break;
1114 		default:
1115 			ASSERT(false);
1116 		}
1117 
1118 		switch(textureStage.stageOperationAlpha)
1119 		{
1120 		case TextureStage::STAGE_DISABLE:
1121 		case TextureStage::STAGE_SELECTARG1:
1122 		case TextureStage::STAGE_SELECTARG2:
1123 		case TextureStage::STAGE_SELECTARG3:
1124 		case TextureStage::STAGE_MODULATE:
1125 		case TextureStage::STAGE_SUBTRACT:
1126 		case TextureStage::STAGE_ADDSMOOTH:
1127 		case TextureStage::STAGE_LERP:
1128 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1129 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1130 		case TextureStage::STAGE_BLENDFACTORALPHA:
1131 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1132 		case TextureStage::STAGE_DOT3:   // Already clamped
1133 		case TextureStage::STAGE_PREMODULATE:
1134 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1135 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1136 		case TextureStage::STAGE_BUMPENVMAP:
1137 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1138 			break;   // Can't go above one
1139 		case TextureStage::STAGE_MODULATE2X:
1140 		case TextureStage::STAGE_MODULATE4X:
1141 		case TextureStage::STAGE_ADD:
1142 		case TextureStage::STAGE_ADDSIGNED:
1143 		case TextureStage::STAGE_ADDSIGNED2X:
1144 		case TextureStage::STAGE_MULTIPLYADD:
1145 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1146 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1147 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1148 			res.w = Min(res.w, Short4(0x1000));
1149 			break;
1150 		default:
1151 			ASSERT(false);
1152 		}
1153 
1154 		switch(textureStage.destinationArgument)
1155 		{
1156 		case TextureStage::DESTINATION_CURRENT:
1157 			current.x = res.x;
1158 			current.y = res.y;
1159 			current.z = res.z;
1160 			current.w = res.w;
1161 			break;
1162 		case TextureStage::DESTINATION_TEMP:
1163 			temp.x = res.x;
1164 			temp.y = res.y;
1165 			temp.z = res.z;
1166 			temp.w = res.w;
1167 			break;
1168 		default:
1169 			ASSERT(false);
1170 		}
1171 	}
1172 
fogBlend(Vector4s & current,Float4 & f)1173 	void PixelPipeline::fogBlend(Vector4s &current, Float4 &f)
1174 	{
1175 		if(!state.fogActive)
1176 		{
1177 			return;
1178 		}
1179 
1180 		if(state.pixelFogMode != FOG_NONE)
1181 		{
1182 			pixelFog(f);
1183 		}
1184 
1185 		UShort4 fog = convertFixed16(f, true);
1186 
1187 		current.x = As<Short4>(MulHigh(As<UShort4>(current.x), fog));
1188 		current.y = As<Short4>(MulHigh(As<UShort4>(current.y), fog));
1189 		current.z = As<Short4>(MulHigh(As<UShort4>(current.z), fog));
1190 
1191 		UShort4 invFog = UShort4(0xFFFFu) - fog;
1192 
1193 		current.x += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[0]))));
1194 		current.y += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[1]))));
1195 		current.z += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[2]))));
1196 	}
1197 
specularPixel(Vector4s & current,Vector4s & specular)1198 	void PixelPipeline::specularPixel(Vector4s &current, Vector4s &specular)
1199 	{
1200 		if(!state.specularAdd)
1201 		{
1202 			return;
1203 		}
1204 
1205 		current.x = AddSat(current.x, specular.x);
1206 		current.y = AddSat(current.y, specular.y);
1207 		current.z = AddSat(current.z, specular.z);
1208 	}
1209 
sampleTexture(Vector4s & c,int coordinates,int stage,bool project)1210 	void PixelPipeline::sampleTexture(Vector4s &c, int coordinates, int stage, bool project)
1211 	{
1212 		Float4 x = v[2 + coordinates].x;
1213 		Float4 y = v[2 + coordinates].y;
1214 		Float4 z = v[2 + coordinates].z;
1215 		Float4 w = v[2 + coordinates].w;
1216 
1217 		if(perturbate)
1218 		{
1219 			x += du;
1220 			y += dv;
1221 
1222 			perturbate = false;
1223 		}
1224 
1225 		sampleTexture(c, stage, x, y, z, w, project);
1226 	}
1227 
sampleTexture(Vector4s & c,int stage,Float4 & u,Float4 & v,Float4 & w,Float4 & q,bool project)1228 	void PixelPipeline::sampleTexture(Vector4s &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project)
1229 	{
1230 		#if PERF_PROFILE
1231 			Long texTime = Ticks();
1232 		#endif
1233 
1234 		Vector4f dsx;
1235 		Vector4f dsy;
1236 
1237 		Pointer<Byte> texture = data + OFFSET(DrawData, mipmap) + stage * sizeof(Texture);
1238 
1239 		if(!project)
1240 		{
1241 			sampler[stage]->sampleTexture(texture, c, u, v, w, q, dsx, dsy);
1242 		}
1243 		else
1244 		{
1245 			Float4 rq = reciprocal(q);
1246 
1247 			Float4 u_q = u * rq;
1248 			Float4 v_q = v * rq;
1249 			Float4 w_q = w * rq;
1250 
1251 			sampler[stage]->sampleTexture(texture, c, u_q, v_q, w_q, q, dsx, dsy);
1252 		}
1253 
1254 		#if PERF_PROFILE
1255 			cycles[PERF_TEX] += Ticks() - texTime;
1256 		#endif
1257 	}
1258 
convertFixed12(RValue<Float4> cf)1259 	Short4 PixelPipeline::convertFixed12(RValue<Float4> cf)
1260 	{
1261 		return RoundShort4(cf * Float4(0x1000));
1262 	}
1263 
convertFixed12(Vector4s & cs,Vector4f & cf)1264 	void PixelPipeline::convertFixed12(Vector4s &cs, Vector4f &cf)
1265 	{
1266 		cs.x = convertFixed12(cf.x);
1267 		cs.y = convertFixed12(cf.y);
1268 		cs.z = convertFixed12(cf.z);
1269 		cs.w = convertFixed12(cf.w);
1270 	}
1271 
convertSigned12(Short4 & cs)1272 	Float4 PixelPipeline::convertSigned12(Short4 &cs)
1273 	{
1274 		return Float4(cs) * Float4(1.0f / 0x0FFE);
1275 	}
1276 
convertSigned12(Vector4f & cf,Vector4s & cs)1277 	void PixelPipeline::convertSigned12(Vector4f &cf, Vector4s &cs)
1278 	{
1279 		cf.x = convertSigned12(cs.x);
1280 		cf.y = convertSigned12(cs.y);
1281 		cf.z = convertSigned12(cs.z);
1282 		cf.w = convertSigned12(cs.w);
1283 	}
1284 
writeDestination(Vector4s & d,const Dst & dst)1285 	void PixelPipeline::writeDestination(Vector4s &d, const Dst &dst)
1286 	{
1287 		switch(dst.type)
1288 		{
1289 		case Shader::PARAMETER_TEMP:
1290 			if(dst.mask & 0x1) rs[dst.index].x = d.x;
1291 			if(dst.mask & 0x2) rs[dst.index].y = d.y;
1292 			if(dst.mask & 0x4) rs[dst.index].z = d.z;
1293 			if(dst.mask & 0x8) rs[dst.index].w = d.w;
1294 			break;
1295 		case Shader::PARAMETER_INPUT:
1296 			if(dst.mask & 0x1) vs[dst.index].x = d.x;
1297 			if(dst.mask & 0x2) vs[dst.index].y = d.y;
1298 			if(dst.mask & 0x4) vs[dst.index].z = d.z;
1299 			if(dst.mask & 0x8) vs[dst.index].w = d.w;
1300 			break;
1301 		case Shader::PARAMETER_CONST: ASSERT(false); break;
1302 		case Shader::PARAMETER_TEXTURE:
1303 			if(dst.mask & 0x1) ts[dst.index].x = d.x;
1304 			if(dst.mask & 0x2) ts[dst.index].y = d.y;
1305 			if(dst.mask & 0x4) ts[dst.index].z = d.z;
1306 			if(dst.mask & 0x8) ts[dst.index].w = d.w;
1307 			break;
1308 		case Shader::PARAMETER_COLOROUT:
1309 			if(dst.mask & 0x1) vs[dst.index].x = d.x;
1310 			if(dst.mask & 0x2) vs[dst.index].y = d.y;
1311 			if(dst.mask & 0x4) vs[dst.index].z = d.z;
1312 			if(dst.mask & 0x8) vs[dst.index].w = d.w;
1313 			break;
1314 		default:
1315 			ASSERT(false);
1316 		}
1317 	}
1318 
fetchRegister(const Src & src)1319 	Vector4s PixelPipeline::fetchRegister(const Src &src)
1320 	{
1321 		Vector4s *reg;
1322 		int i = src.index;
1323 
1324 		Vector4s c;
1325 
1326 		if(src.type == Shader::PARAMETER_CONST)
1327 		{
1328 			c.x = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][0]));
1329 			c.y = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][1]));
1330 			c.z = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][2]));
1331 			c.w = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][3]));
1332 		}
1333 
1334 		switch(src.type)
1335 		{
1336 		case Shader::PARAMETER_TEMP:          reg = &rs[i]; break;
1337 		case Shader::PARAMETER_INPUT:         reg = &vs[i]; break;
1338 		case Shader::PARAMETER_CONST:         reg = &c;       break;
1339 		case Shader::PARAMETER_TEXTURE:       reg = &ts[i]; break;
1340 		case Shader::PARAMETER_VOID:          return rs[0]; // Dummy
1341 		case Shader::PARAMETER_FLOAT4LITERAL: return rs[0]; // Dummy
1342 		default: ASSERT(false); return rs[0];
1343 		}
1344 
1345 		const Short4 &x = (*reg)[(src.swizzle >> 0) & 0x3];
1346 		const Short4 &y = (*reg)[(src.swizzle >> 2) & 0x3];
1347 		const Short4 &z = (*reg)[(src.swizzle >> 4) & 0x3];
1348 		const Short4 &w = (*reg)[(src.swizzle >> 6) & 0x3];
1349 
1350 		Vector4s mod;
1351 
1352 		switch(src.modifier)
1353 		{
1354 		case Shader::MODIFIER_NONE:
1355 			mod.x = x;
1356 			mod.y = y;
1357 			mod.z = z;
1358 			mod.w = w;
1359 			break;
1360 		case Shader::MODIFIER_BIAS:
1361 			mod.x = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1362 			mod.y = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1363 			mod.z = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1364 			mod.w = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1365 			break;
1366 		case Shader::MODIFIER_BIAS_NEGATE:
1367 			mod.x = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
1368 			mod.y = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
1369 			mod.z = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
1370 			mod.w = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
1371 			break;
1372 		case Shader::MODIFIER_COMPLEMENT:
1373 			mod.x = SubSat(Short4(0x1000), x);
1374 			mod.y = SubSat(Short4(0x1000), y);
1375 			mod.z = SubSat(Short4(0x1000), z);
1376 			mod.w = SubSat(Short4(0x1000), w);
1377 			break;
1378 		case Shader::MODIFIER_NEGATE:
1379 			mod.x = -x;
1380 			mod.y = -y;
1381 			mod.z = -z;
1382 			mod.w = -w;
1383 			break;
1384 		case Shader::MODIFIER_X2:
1385 			mod.x = AddSat(x, x);
1386 			mod.y = AddSat(y, y);
1387 			mod.z = AddSat(z, z);
1388 			mod.w = AddSat(w, w);
1389 			break;
1390 		case Shader::MODIFIER_X2_NEGATE:
1391 			mod.x = -AddSat(x, x);
1392 			mod.y = -AddSat(y, y);
1393 			mod.z = -AddSat(z, z);
1394 			mod.w = -AddSat(w, w);
1395 			break;
1396 		case Shader::MODIFIER_SIGN:
1397 			mod.x = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1398 			mod.y = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1399 			mod.z = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1400 			mod.w = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
1401 			mod.x = AddSat(mod.x, mod.x);
1402 			mod.y = AddSat(mod.y, mod.y);
1403 			mod.z = AddSat(mod.z, mod.z);
1404 			mod.w = AddSat(mod.w, mod.w);
1405 			break;
1406 		case Shader::MODIFIER_SIGN_NEGATE:
1407 			mod.x = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
1408 			mod.y = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
1409 			mod.z = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
1410 			mod.w = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
1411 			mod.x = AddSat(mod.x, mod.x);
1412 			mod.y = AddSat(mod.y, mod.y);
1413 			mod.z = AddSat(mod.z, mod.z);
1414 			mod.w = AddSat(mod.w, mod.w);
1415 			break;
1416 		case Shader::MODIFIER_DZ:
1417 			mod.x = x;
1418 			mod.y = y;
1419 			mod.z = z;
1420 			mod.w = w;
1421 			// Projection performed by texture sampler
1422 			break;
1423 		case Shader::MODIFIER_DW:
1424 			mod.x = x;
1425 			mod.y = y;
1426 			mod.z = z;
1427 			mod.w = w;
1428 			// Projection performed by texture sampler
1429 			break;
1430 		default:
1431 			ASSERT(false);
1432 		}
1433 
1434 		if(src.type == Shader::PARAMETER_CONST && (src.modifier == Shader::MODIFIER_X2 || src.modifier == Shader::MODIFIER_X2_NEGATE))
1435 		{
1436 			mod.x = Min(mod.x, Short4(0x1000)); mod.x = Max(mod.x, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1437 			mod.y = Min(mod.y, Short4(0x1000)); mod.y = Max(mod.y, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1438 			mod.z = Min(mod.z, Short4(0x1000)); mod.z = Max(mod.z, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1439 			mod.w = Min(mod.w, Short4(0x1000)); mod.w = Max(mod.w, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
1440 		}
1441 
1442 		return mod;
1443 	}
1444 
MOV(Vector4s & dst,Vector4s & src0)1445 	void PixelPipeline::MOV(Vector4s &dst, Vector4s &src0)
1446 	{
1447 		dst.x = src0.x;
1448 		dst.y = src0.y;
1449 		dst.z = src0.z;
1450 		dst.w = src0.w;
1451 	}
1452 
ADD(Vector4s & dst,Vector4s & src0,Vector4s & src1)1453 	void PixelPipeline::ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1454 	{
1455 		dst.x = AddSat(src0.x, src1.x);
1456 		dst.y = AddSat(src0.y, src1.y);
1457 		dst.z = AddSat(src0.z, src1.z);
1458 		dst.w = AddSat(src0.w, src1.w);
1459 	}
1460 
SUB(Vector4s & dst,Vector4s & src0,Vector4s & src1)1461 	void PixelPipeline::SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1462 	{
1463 		dst.x = SubSat(src0.x, src1.x);
1464 		dst.y = SubSat(src0.y, src1.y);
1465 		dst.z = SubSat(src0.z, src1.z);
1466 		dst.w = SubSat(src0.w, src1.w);
1467 	}
1468 
MAD(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1469 	void PixelPipeline::MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1470 	{
1471 		// FIXME: Long fixed-point multiply fixup
1472 		{ dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x); }
1473 		{
1474 		dst.y = MulHigh(src0.y, src1.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, src2.y);
1475 	}
1476 		{dst.z = MulHigh(src0.z, src1.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, src2.z); }
1477 		{dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w); }
1478 	}
1479 
MUL(Vector4s & dst,Vector4s & src0,Vector4s & src1)1480 	void PixelPipeline::MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1481 	{
1482 		// FIXME: Long fixed-point multiply fixup
1483 		{ dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); }
1484 		{
1485 		dst.y = MulHigh(src0.y, src1.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y);
1486 	}
1487 		{dst.z = MulHigh(src0.z, src1.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); }
1488 		{dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); }
1489 	}
1490 
DP3(Vector4s & dst,Vector4s & src0,Vector4s & src1)1491 	void PixelPipeline::DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1492 	{
1493 		Short4 t0;
1494 		Short4 t1;
1495 
1496 		// FIXME: Long fixed-point multiply fixup
1497 		t0 = MulHigh(src0.x, src1.x); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0);
1498 		t1 = MulHigh(src0.y, src1.y); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1499 		t0 = AddSat(t0, t1);
1500 		t1 = MulHigh(src0.z, src1.z); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1501 		t0 = AddSat(t0, t1);
1502 
1503 		dst.x = t0;
1504 		dst.y = t0;
1505 		dst.z = t0;
1506 		dst.w = t0;
1507 	}
1508 
DP4(Vector4s & dst,Vector4s & src0,Vector4s & src1)1509 	void PixelPipeline::DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1510 	{
1511 		Short4 t0;
1512 		Short4 t1;
1513 
1514 		// FIXME: Long fixed-point multiply fixup
1515 		t0 = MulHigh(src0.x, src1.x); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0);
1516 		t1 = MulHigh(src0.y, src1.y); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1517 		t0 = AddSat(t0, t1);
1518 		t1 = MulHigh(src0.z, src1.z); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1519 		t0 = AddSat(t0, t1);
1520 		t1 = MulHigh(src0.w, src1.w); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1521 		t0 = AddSat(t0, t1);
1522 
1523 		dst.x = t0;
1524 		dst.y = t0;
1525 		dst.z = t0;
1526 		dst.w = t0;
1527 	}
1528 
LRP(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1529 	void PixelPipeline::LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1530 	{
1531 		// FIXME: Long fixed-point multiply fixup
1532 		{ dst.x = SubSat(src1.x, src2.x); dst.x = MulHigh(dst.x, src0.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x); }
1533 		{
1534 		dst.y = SubSat(src1.y, src2.y); dst.y = MulHigh(dst.y, src0.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, src2.y);
1535 	}
1536 		{dst.z = SubSat(src1.z, src2.z); dst.z = MulHigh(dst.z, src0.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, src2.z); }
1537 		{dst.w = SubSat(src1.w, src2.w); dst.w = MulHigh(dst.w, src0.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w); }
1538 	}
1539 
TEXCOORD(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,int coordinate)1540 	void PixelPipeline::TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
1541 	{
1542 		Float4 uw;
1543 		Float4 vw;
1544 		Float4 sw;
1545 
1546 		if(state.interpolant[2 + coordinate].component & 0x01)
1547 		{
1548 			uw = Max(u, Float4(0.0f));
1549 			uw = Min(uw, Float4(1.0f));
1550 			dst.x = convertFixed12(uw);
1551 		}
1552 		else
1553 		{
1554 			dst.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1555 		}
1556 
1557 		if(state.interpolant[2 + coordinate].component & 0x02)
1558 		{
1559 			vw = Max(v, Float4(0.0f));
1560 			vw = Min(vw, Float4(1.0f));
1561 			dst.y = convertFixed12(vw);
1562 		}
1563 		else
1564 		{
1565 			dst.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1566 		}
1567 
1568 		if(state.interpolant[2 + coordinate].component & 0x04)
1569 		{
1570 			sw = Max(s, Float4(0.0f));
1571 			sw = Min(sw, Float4(1.0f));
1572 			dst.z = convertFixed12(sw);
1573 		}
1574 		else
1575 		{
1576 			dst.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1577 		}
1578 
1579 		dst.w = Short4(0x1000);
1580 	}
1581 
TEXCRD(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,int coordinate,bool project)1582 	void PixelPipeline::TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
1583 	{
1584 		Float4 uw = u;
1585 		Float4 vw = v;
1586 		Float4 sw = s;
1587 
1588 		if(project)
1589 		{
1590 			uw *= Rcp_pp(s);
1591 			vw *= Rcp_pp(s);
1592 		}
1593 
1594 		if(state.interpolant[2 + coordinate].component & 0x01)
1595 		{
1596 			uw *= Float4(0x1000);
1597 			uw = Max(uw, Float4(-0x8000));
1598 			uw = Min(uw, Float4(0x7FFF));
1599 			dst.x = RoundShort4(uw);
1600 		}
1601 		else
1602 		{
1603 			dst.x = Short4(0x0000);
1604 		}
1605 
1606 		if(state.interpolant[2 + coordinate].component & 0x02)
1607 		{
1608 			vw *= Float4(0x1000);
1609 			vw = Max(vw, Float4(-0x8000));
1610 			vw = Min(vw, Float4(0x7FFF));
1611 			dst.y = RoundShort4(vw);
1612 		}
1613 		else
1614 		{
1615 			dst.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1616 		}
1617 
1618 		if(state.interpolant[2 + coordinate].component & 0x04)
1619 		{
1620 			sw *= Float4(0x1000);
1621 			sw = Max(sw, Float4(-0x8000));
1622 			sw = Min(sw, Float4(0x7FFF));
1623 			dst.z = RoundShort4(sw);
1624 		}
1625 		else
1626 		{
1627 			dst.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
1628 		}
1629 	}
1630 
TEXDP3(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,Vector4s & src)1631 	void PixelPipeline::TEXDP3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src)
1632 	{
1633 		TEXM3X3PAD(u, v, s, src, 0, false);
1634 
1635 		Short4 t0 = RoundShort4(u_ * Float4(0x1000));
1636 
1637 		dst.x = t0;
1638 		dst.y = t0;
1639 		dst.z = t0;
1640 		dst.w = t0;
1641 	}
1642 
	// TEXDP3TEX: computes the dot product of the texture coordinate (u, v, s)
	// with src0.xyz and uses it as the first coordinate of a texture lookup in
	// 'stage'. The remaining coordinates are zero. The result is written to dst.
	void PixelPipeline::TEXDP3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0)
	{
		// Component 0 leaves the dot product in u_.
		TEXM3X3PAD(u, v, s, src0, 0, false);

		// Only the first coordinate is meaningful for this lookup.
		v_ = Float4(0.0f);
		w_ = Float4(0.0f);

		sampleTexture(dst, stage, u_, v_, w_, w_);
	}
1652 
TEXKILL(Int cMask[4],Float4 & u,Float4 & v,Float4 & s)1653 	void PixelPipeline::TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s)
1654 	{
1655 		Int kill = SignMask(CmpNLT(u, Float4(0.0f))) &
1656 			SignMask(CmpNLT(v, Float4(0.0f))) &
1657 			SignMask(CmpNLT(s, Float4(0.0f)));
1658 
1659 		for(unsigned int q = 0; q < state.multiSample; q++)
1660 		{
1661 			cMask[q] &= kill;
1662 		}
1663 	}
1664 
TEXKILL(Int cMask[4],Vector4s & src)1665 	void PixelPipeline::TEXKILL(Int cMask[4], Vector4s &src)
1666 	{
1667 		Short4 test = src.x | src.y | src.z;
1668 		Int kill = SignMask(Pack(test, test)) ^ 0x0000000F;
1669 
1670 		for(unsigned int q = 0; q < state.multiSample; q++)
1671 		{
1672 			cMask[q] &= kill;
1673 		}
1674 	}
1675 
	// TEX: samples the texture bound to 'sampler' at (u, v, s) and writes the
	// result to dst. s is passed for both of the last two coordinate arguments
	// and 'project' is forwarded unchanged to sampleTexture().
	void PixelPipeline::TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
	{
		sampleTexture(dst, sampler, u, v, s, s, project);
	}
1680 
TEXLD(Vector4s & dst,Vector4s & src,int sampler,bool project)1681 	void PixelPipeline::TEXLD(Vector4s &dst, Vector4s &src, int sampler, bool project)
1682 	{
1683 		Float4 u = Float4(src.x) * Float4(1.0f / 0x0FFE);
1684 		Float4 v = Float4(src.y) * Float4(1.0f / 0x0FFE);
1685 		Float4 s = Float4(src.z) * Float4(1.0f / 0x0FFE);
1686 
1687 		sampleTexture(dst, sampler, u, v, s, s, project);
1688 	}
1689 
TEXBEM(Vector4s & dst,Vector4s & src,Float4 & u,Float4 & v,Float4 & s,int stage)1690 	void PixelPipeline::TEXBEM(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
1691 	{
1692 		Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
1693 		Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
1694 
1695 		Float4 du2 = du;
1696 		Float4 dv2 = dv;
1697 
1698 		du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
1699 		dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
1700 		du += dv2;
1701 		dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
1702 		du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
1703 		dv += du2;
1704 
1705 		Float4 u_ = u + du;
1706 		Float4 v_ = v + dv;
1707 
1708 		sampleTexture(dst, stage, u_, v_, s, s);
1709 	}
1710 
TEXBEML(Vector4s & dst,Vector4s & src,Float4 & u,Float4 & v,Float4 & s,int stage)1711 	void PixelPipeline::TEXBEML(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
1712 	{
1713 		Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
1714 		Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
1715 
1716 		Float4 du2 = du;
1717 		Float4 dv2 = dv;
1718 
1719 		du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
1720 		dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
1721 		du += dv2;
1722 		dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
1723 		du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
1724 		dv += du2;
1725 
1726 		Float4 u_ = u + du;
1727 		Float4 v_ = v + dv;
1728 
1729 		sampleTexture(dst, stage, u_, v_, s, s);
1730 
1731 		Short4 L;
1732 
1733 		L = src.z;
1734 		L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
1735 		L = L << 4;
1736 		L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
1737 		L = Max(L, Short4(0x0000, 0x0000, 0x0000, 0x0000));
1738 		L = Min(L, Short4(0x1000));
1739 
1740 		dst.x = MulHigh(dst.x, L); dst.x = dst.x << 4;
1741 		dst.y = MulHigh(dst.y, L); dst.y = dst.y << 4;
1742 		dst.z = MulHigh(dst.z, L); dst.z = dst.z << 4;
1743 	}
1744 
TEXREG2AR(Vector4s & dst,Vector4s & src0,int stage)1745 	void PixelPipeline::TEXREG2AR(Vector4s &dst, Vector4s &src0, int stage)
1746 	{
1747 		Float4 u = Float4(src0.w) * Float4(1.0f / 0x0FFE);
1748 		Float4 v = Float4(src0.x) * Float4(1.0f / 0x0FFE);
1749 		Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1750 
1751 		sampleTexture(dst, stage, u, v, s, s);
1752 	}
1753 
TEXREG2GB(Vector4s & dst,Vector4s & src0,int stage)1754 	void PixelPipeline::TEXREG2GB(Vector4s &dst, Vector4s &src0, int stage)
1755 	{
1756 		Float4 u = Float4(src0.y) * Float4(1.0f / 0x0FFE);
1757 		Float4 v = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1758 		Float4 s = v;
1759 
1760 		sampleTexture(dst, stage, u, v, s, s);
1761 	}
1762 
TEXREG2RGB(Vector4s & dst,Vector4s & src0,int stage)1763 	void PixelPipeline::TEXREG2RGB(Vector4s &dst, Vector4s &src0, int stage)
1764 	{
1765 		Float4 u = Float4(src0.x) * Float4(1.0f / 0x0FFE);
1766 		Float4 v = Float4(src0.y) * Float4(1.0f / 0x0FFE);
1767 		Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1768 
1769 		sampleTexture(dst, stage, u, v, s, s);
1770 	}
1771 
TEXM3X2DEPTH(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,Vector4s & src,bool signedScaling)1772 	void PixelPipeline::TEXM3X2DEPTH(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling)
1773 	{
1774 		TEXM3X2PAD(u, v, s, src, 1, signedScaling);
1775 
1776 		// z / w
1777 		u_ *= Rcp_pp(v_);   // FIXME: Set result to 1.0 when division by zero
1778 
1779 		oDepth = u_;
1780 	}
1781 
	// TEXM3X2PAD: computes one row of a 3x2 matrix multiply. All arguments are
	// forwarded unchanged to TEXM3X3PAD, which stores the row's dot product in
	// u_, v_ or w_ depending on 'component'.
	void PixelPipeline::TEXM3X2PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
	{
		TEXM3X3PAD(u, v, s, src0, component, signedScaling);
	}
1786 
	// TEXM3X2TEX: computes row 1 of a 3x2 matrix multiply and samples 'stage'
	// at (u_, v_) with the remaining coordinates set to zero. The result is
	// written to dst.
	// u_ is not computed here; it is expected to have been produced by an
	// earlier component-0 pad.
	void PixelPipeline::TEXM3X2TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
	{
		// Component 1 stores its dot product in v_.
		TEXM3X2PAD(u, v, s, src0, 1, signedScaling);

		// A 3x2 transform yields only two coordinates.
		w_ = Float4(0.0f);

		sampleTexture(dst, stage, u_, v_, w_, w_);
	}
1795 
TEXM3X3(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,Vector4s & src0,bool signedScaling)1796 	void PixelPipeline::TEXM3X3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling)
1797 	{
1798 		TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
1799 
1800 		dst.x = RoundShort4(u_ * Float4(0x1000));
1801 		dst.y = RoundShort4(v_ * Float4(0x1000));
1802 		dst.z = RoundShort4(w_ * Float4(0x1000));
1803 		dst.w = Short4(0x1000);
1804 	}
1805 
TEXM3X3PAD(Float4 & u,Float4 & v,Float4 & s,Vector4s & src0,int component,bool signedScaling)1806 	void PixelPipeline::TEXM3X3PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
1807 	{
1808 		if(component == 0 || previousScaling != signedScaling)   // FIXME: Other source modifiers?
1809 		{
1810 			U = Float4(src0.x);
1811 			V = Float4(src0.y);
1812 			W = Float4(src0.z);
1813 
1814 			previousScaling = signedScaling;
1815 		}
1816 
1817 		Float4 x = U * u + V * v + W * s;
1818 
1819 		x *= Float4(1.0f / 0x1000);
1820 
1821 		switch(component)
1822 		{
1823 		case 0:	u_ = x; break;
1824 		case 1:	v_ = x; break;
1825 		case 2: w_ = x; break;
1826 		default: ASSERT(false);
1827 		}
1828 	}
1829 
	// TEXM3X3SPEC: computes row 2 of a 3x3 matrix multiply, reflects the eye
	// vector taken from src1 about the resulting vector N = (u_, v_, w_), and
	// samples 'stage' with the reflection vector. The result is written to dst.
	// u_ and v_ are expected to have been produced by earlier component-0 and
	// component-1 pads.
	// Side effect: u_, v_ and w_ are overwritten while computing N . N.
	void PixelPipeline::TEXM3X3SPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1)
	{
		// Component 2 stores its dot product in w_.
		TEXM3X3PAD(u, v, s, src0, 2, false);

		Float4 E[3];   // Eye vector

		// src1.xyz converted to floating point and multiplied by 1 / 0x0FFE.
		E[0] = Float4(src1.x) * Float4(1.0f / 0x0FFE);
		E[1] = Float4(src1.y) * Float4(1.0f / 0x0FFE);
		E[2] = Float4(src1.z) * Float4(1.0f / 0x0FFE);

		// Reflection
		Float4 u__;
		Float4 v__;
		Float4 w__;

		// (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
		// Accumulate N . E in u__.
		u__ = u_ * E[0];
		v__ = v_ * E[1];
		w__ = w_ * E[2];
		u__ += v__ + w__;
		// Double it, then broadcast 2 * (N . E) to all three outputs.
		u__ += u__;
		v__ = u__;
		w__ = u__;
		// Multiply by N component-wise: 2 * (N . E) * N.
		u__ *= u_;
		v__ *= v_;
		w__ *= w_;
		// Square N in place and accumulate N . N in u_.
		u_ *= u_;
		v_ *= v_;
		w_ *= w_;
		u_ += v_ + w_;
		// Subtract E * (N . N).
		u__ -= E[0] * u_;
		v__ -= E[1] * u_;
		w__ -= E[2] * u_;

		sampleTexture(dst, stage, u__, v__, w__, w__);
	}
1866 
	// TEXM3X3TEX: computes row 2 of a 3x3 matrix multiply and samples 'stage'
	// at the transformed coordinate (u_, v_, w_). The result is written to dst.
	// u_ and v_ are not computed here; they are expected to have been produced
	// by earlier component-0 and component-1 pads.
	void PixelPipeline::TEXM3X3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
	{
		// Component 2 stores its dot product in w_.
		TEXM3X3PAD(u, v, s, src0, 2, signedScaling);

		sampleTexture(dst, stage, u_, v_, w_, w_);
	}
1873 
	// TEXM3X3VSPEC: like TEXM3X3SPEC, but the eye vector is read from the w
	// components of three consecutive interpolants instead of from a register.
	// Computes row 2 of a 3x3 matrix multiply, reflects the eye vector about the
	// resulting vector N = (u_, v_, w_), and samples 'stage' with the reflection
	// vector. The result is written to dst.
	// u_ and v_ are expected to have been produced by earlier component-0 and
	// component-1 pads.
	// Side effect: u_, v_ and w_ are overwritten while computing N . N.
	void PixelPipeline::TEXM3X3VSPEC(Vector4s &dst, Float4 &x, Float4 &y, Float4 &z, int stage, Vector4s &src0)
	{
		// Component 2 stores its dot product in w_.
		TEXM3X3PAD(x, y, z, src0, 2, false);

		Float4 E[3];   // Eye vector

		// 'v' is the class member here (the parameters are named x, y, z).
		// The eye vector comes from the w components of the interpolants at
		// indices stage, stage + 1 and stage + 2.
		E[0] = v[2 + stage - 2].w;
		E[1] = v[2 + stage - 1].w;
		E[2] = v[2 + stage - 0].w;

		// Reflection
		Float4 u__;
		Float4 v__;
		Float4 w__;

		// (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
		// Accumulate N . E in u__.
		u__ = u_ * E[0];
		v__ = v_ * E[1];
		w__ = w_ * E[2];
		u__ += v__ + w__;
		// Double it, then broadcast 2 * (N . E) to all three outputs.
		u__ += u__;
		v__ = u__;
		w__ = u__;
		// Multiply by N component-wise: 2 * (N . E) * N.
		u__ *= u_;
		v__ *= v_;
		w__ *= w_;
		// Square N in place and accumulate N . N in u_.
		u_ *= u_;
		v_ *= v_;
		w_ *= w_;
		u_ += v_ + w_;
		// Subtract E * (N . N).
		u__ -= E[0] * u_;
		v__ -= E[1] * u_;
		w__ -= E[2] * u_;

		sampleTexture(dst, stage, u__, v__, w__, w__);
	}
1910 
TEXDEPTH()1911 	void PixelPipeline::TEXDEPTH()
1912 	{
1913 		u_ = Float4(rs[5].x);
1914 		v_ = Float4(rs[5].y);
1915 
1916 		// z / w
1917 		u_ *= Rcp_pp(v_);   // FIXME: Set result to 1.0 when division by zero
1918 
1919 		oDepth = u_;
1920 	}
1921 
CND(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1922 	void PixelPipeline::CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1923 	{
1924 		{ Short4 t0; t0 = src0.x; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.x; t1 = t1 & t0; t0 = ~t0 & src2.x; t0 = t0 | t1; dst.x = t0; };
1925 		{Short4 t0; t0 = src0.y; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.y; t1 = t1 & t0; t0 = ~t0 & src2.y; t0 = t0 | t1; dst.y = t0; };
1926 		{Short4 t0; t0 = src0.z; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.z; t1 = t1 & t0; t0 = ~t0 & src2.z; t0 = t0 | t1; dst.z = t0; };
1927 		{Short4 t0; t0 = src0.w; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.w; t1 = t1 & t0; t0 = ~t0 & src2.w; t0 = t0 | t1; dst.w = t0; };
1928 	}
1929 
CMP(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1930 	void PixelPipeline::CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1931 	{
1932 		{ Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.x); Short4 t1; t1 = src2.x; t1 &= t0; t0 = ~t0 & src1.x; t0 |= t1; dst.x = t0; };
1933 		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.y); Short4 t1; t1 = src2.y; t1 &= t0; t0 = ~t0 & src1.y; t0 |= t1; dst.y = t0; };
1934 		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.z); Short4 t1; t1 = src2.z; t1 &= t0; t0 = ~t0 & src1.z; t0 |= t1; dst.z = t0; };
1935 		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.w); Short4 t1; t1 = src2.w; t1 &= t0; t0 = ~t0 & src1.w; t0 |= t1; dst.w = t0; };
1936 	}
1937 
BEM(Vector4s & dst,Vector4s & src0,Vector4s & src1,int stage)1938 	void PixelPipeline::BEM(Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage)
1939 	{
1940 		Short4 t0;
1941 		Short4 t1;
1942 
1943 		// dst.x = src0.x + BUMPENVMAT00(stage) * src1.x + BUMPENVMAT10(stage) * src1.y
1944 		t0 = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][0]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
1945 		t1 = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][0]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
1946 		t0 = AddSat(t0, t1);
1947 		t0 = AddSat(t0, src0.x);
1948 		dst.x = t0;
1949 
1950 		// dst.y = src0.y + BUMPENVMAT01(stage) * src1.x + BUMPENVMAT11(stage) * src1.y
1951 		t0 = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][1]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
1952 		t1 = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][1]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
1953 		t0 = AddSat(t0, t1);
1954 		t0 = AddSat(t0, src0.y);
1955 		dst.y = t0;
1956 	}
1957 }
1958 
1959