1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "PixelRoutine.hpp" 16 17 #include "Renderer.hpp" 18 #include "QuadRasterizer.hpp" 19 #include "Surface.hpp" 20 #include "Primitive.hpp" 21 #include "CPUID.hpp" 22 #include "SamplerCore.hpp" 23 #include "Constants.hpp" 24 #include "Debug.hpp" 25 26 namespace sw 27 { 28 extern bool complementaryDepthBuffer; 29 extern bool postBlendSRGB; 30 extern bool exactColorRounding; 31 extern bool forceClearRegisters; 32 PixelRoutine(const PixelProcessor::State & state,const PixelShader * shader)33 PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput) 34 { 35 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters) 36 { 37 for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++) 38 { 39 v[i].x = Float4(0.0f); 40 v[i].y = Float4(0.0f); 41 v[i].z = Float4(0.0f); 42 v[i].w = Float4(0.0f); 43 } 44 } 45 } 46 ~PixelRoutine()47 PixelRoutine::~PixelRoutine() 48 { 49 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++) 50 { 51 delete sampler[i]; 52 } 53 } 54 quad(Pointer<Byte> cBuffer[RENDERTARGETS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)55 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) 56 { 57 #if PERF_PROFILE 58 Long pipeTime = Ticks(); 59 #endif 60 61 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++) 62 { 63 sampler[i] = new SamplerCore(constants, state.sampler[i]); 64 } 65 66 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive(); 67 68 Int zMask[4]; // Depth mask 69 Int sMask[4]; // Stencil mask 70 71 for(unsigned int q = 0; q < state.multiSample; q++) 72 { 73 zMask[q] = cMask[q]; 74 sMask[q] = cMask[q]; 75 } 76 77 for(unsigned int q = 0; q < state.multiSample; q++) 78 { 79 stencilTest(sBuffer, q, x, sMask[q], cMask[q]); 80 } 81 82 Float4 f; 83 Float4 rhwCentroid; 84 85 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16); 86 87 if(interpolateZ()) 88 { 89 for(unsigned int q = 0; q < state.multiSample; q++) 90 { 91 Float4 x = xxxx; 92 93 if(state.multiSample > 1) 94 { 95 x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4)); 96 } 97 98 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false); 99 } 100 } 101 102 Bool depthPass = false; 103 104 if(earlyDepthTest) 105 { 106 for(unsigned int q = 0; q < state.multiSample; q++) 107 { 108 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]); 109 } 110 } 111 112 If(depthPass || Bool(!earlyDepthTest)) 113 { 114 #if PERF_PROFILE 115 Long interpTime = Ticks(); 116 #endif 117 118 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16); 119 120 // Centroid locations 121 Float4 XXXX = Float4(0.0f); 122 Float4 YYYY = Float4(0.0f); 123 124 if(state.centroid) 125 { 126 Float4 WWWW(1.0e-9f); 127 128 for(unsigned int q = 0; q < state.multiSample; q++) 129 { 130 XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]); 131 YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]); 132 WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]); 133 } 134 135 WWWW = Rcp_pp(WWWW); 136 XXXX *= WWWW; 137 YYYY *= WWWW; 138 139 XXXX += xxxx; 140 YYYY += yyyy; 141 } 142 143 if(interpolateW()) 144 { 145 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false); 146 rhw = reciprocal(w, false, false, true); 147 148 if(state.centroid) 149 { 150 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false)); 151 } 152 } 153 154 for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++) 155 { 156 for(int component = 0; component < 4; component++) 157 { 158 if(state.interpolant[interpolant].component & (1 << component)) 159 { 160 if(!state.interpolant[interpolant].centroid) 161 { 162 v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective); 163 } 164 else 165 { 166 v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective); 167 } 168 } 169 } 170 171 Float4 rcp; 172 173 switch(state.interpolant[interpolant].project) 174 { 175 case 0: 176 break; 177 case 1: 178 rcp = reciprocal(v[interpolant].y); 179 v[interpolant].x = v[interpolant].x * rcp; 180 break; 181 case 2: 182 rcp = reciprocal(v[interpolant].z); 183 v[interpolant].x = v[interpolant].x * rcp; 184 v[interpolant].y = v[interpolant].y * rcp; 185 break; 186 case 3: 187 rcp = reciprocal(v[interpolant].w); 188 v[interpolant].x = v[interpolant].x * rcp; 189 v[interpolant].y = v[interpolant].y * rcp; 190 v[interpolant].z = v[interpolant].z * rcp; 191 break; 192 } 193 } 194 195 if(state.fog.component) 196 { 197 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective); 198 } 199 200 setBuiltins(x, y, z, w); 201 202 #if PERF_PROFILE 203 cycles[PERF_INTERP] += Ticks() - interpTime; 204 #endif 205 206 Bool alphaPass = true; 207 208 if(colorUsed()) 209 { 210 #if PERF_PROFILE 211 Long shaderTime = Ticks(); 212 #endif 213 214 applyShader(cMask); 215 216 #if PERF_PROFILE 217 cycles[PERF_SHADER] += Ticks() - shaderTime; 218 #endif 219 220 alphaPass = alphaTest(cMask); 221 222 if((shader && shader->containsKill()) || state.alphaTestActive()) 223 { 224 for(unsigned int q = 0; q < state.multiSample; q++) 225 { 226 zMask[q] &= cMask[q]; 227 sMask[q] &= cMask[q]; 228 } 229 } 230 } 231 232 If(alphaPass) 233 { 234 if(!earlyDepthTest) 235 { 236 for(unsigned int q = 0; q < state.multiSample; q++) 237 { 238 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]); 239 } 240 } 241 242 #if PERF_PROFILE 243 Long ropTime = Ticks(); 244 #endif 245 246 If(depthPass || Bool(earlyDepthTest)) 247 { 248 for(unsigned int q = 0; q < state.multiSample; q++) 249 { 250 if(state.multiSampleMask & (1 << q)) 251 { 252 writeDepth(zBuffer, q, x, z[q], zMask[q]); 253 254 if(state.occlusionEnabled) 255 { 256 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q])); 257 } 258 } 259 } 260 261 if(colorUsed()) 262 { 263 #if PERF_PROFILE 264 AddAtomic(Pointer<Long>(&profiler.ropOperations), 4); 265 #endif 266 267 rasterOperation(f, cBuffer, x, sMask, zMask, cMask); 268 } 269 } 270 271 #if PERF_PROFILE 272 cycles[PERF_ROP] += Ticks() - ropTime; 273 #endif 274 } 275 } 276 277 for(unsigned int q = 0; q < state.multiSample; q++) 278 { 279 if(state.multiSampleMask & (1 << q)) 280 { 281 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]); 282 } 283 } 284 285 #if PERF_PROFILE 286 cycles[PERF_PIPE] += Ticks() - pipeTime; 287 #endif 288 } 289 interpolateCentroid(Float4 & x,Float4 & y,Float4 & rhw,Pointer<Byte> planeEquation,bool flat,bool perspective)290 Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective) 291 { 292 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16); 293 294 if(!flat) 295 { 296 interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) + 297 y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16); 298 299 if(perspective) 300 { 301 interpolant *= rhw; 302 } 303 } 304 305 return interpolant; 306 } 307 stencilTest(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & cMask)308 void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask) 309 { 310 if(!state.stencilActive) 311 { 312 return; 313 } 314 315 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask) 316 317 Pointer<Byte> buffer = sBuffer + 2 * x; 318 319 if(q > 0) 320 { 321 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); 322 } 323 324 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer))); 325 Byte8 valueCCW = value; 326 327 if(!state.noStencilMask) 328 { 329 value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ)); 330 } 331 332 stencilTest(value, state.stencilCompareMode, false); 333 334 if(state.twoSidedStencil) 335 { 336 if(!state.noStencilMaskCCW) 337 { 338 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ)); 339 } 340 341 stencilTest(valueCCW, state.stencilCompareModeCCW, true); 342 343 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)); 344 valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)); 345 value |= valueCCW; 346 } 347 348 sMask = SignMask(value) & cMask; 349 } 350 stencilTest(Byte8 & value,StencilCompareMode stencilCompareMode,bool CCW)351 void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW) 352 { 353 Byte8 equal; 354 355 switch(stencilCompareMode) 356 { 357 case STENCIL_ALWAYS: 358 value = Byte8(0xFFFFFFFFFFFFFFFF); 359 break; 360 case STENCIL_NEVER: 361 value = Byte8(0x0000000000000000); 362 break; 363 case STENCIL_LESS: // a < b ~ b > a 364 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 365 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); 366 break; 367 case STENCIL_EQUAL: 368 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); 369 break; 370 case STENCIL_NOTEQUAL: // a != b ~ !(a == b) 371 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); 372 value ^= Byte8(0xFFFFFFFFFFFFFFFF); 373 break; 374 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b) 375 equal = value; 376 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ))); 377 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 378 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); 379 value |= equal; 380 break; 381 case STENCIL_GREATER: // a > b 382 equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)); 383 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 384 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value)); 385 value = equal; 386 break; 387 case STENCIL_GREATEREQUAL: // a >= b ~ !(a < b) ~ !(b > a) 388 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); 389 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ))); 390 value ^= Byte8(0xFFFFFFFFFFFFFFFF); 391 break; 392 default: 393 ASSERT(false); 394 } 395 } 396 depthTest(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & sMask,Int & zMask,Int & cMask)397 Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask) 398 { 399 if(!state.depthTestActive) 400 { 401 return true; 402 } 403 404 Float4 Z = z; 405 406 if(shader && shader->depthOverride()) 407 { 408 if(complementaryDepthBuffer) 409 { 410 Z = Float4(1.0f) - oDepth; 411 } 412 else 413 { 414 Z = oDepth; 415 } 416 } 417 418 Pointer<Byte> buffer; 419 Int pitch; 420 421 if(!state.quadLayoutDepthBuffer) 422 { 423 buffer = zBuffer + 4 * x; 424 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)); 425 } 426 else 427 { 428 buffer = zBuffer + 8 * x; 429 } 430 431 if(q > 0) 432 { 433 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); 434 } 435 436 Float4 zValue; 437 438 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable)) 439 { 440 if(!state.quadLayoutDepthBuffer) 441 { 442 // FIXME: Properly optimizes? 443 zValue.xy = *Pointer<Float4>(buffer); 444 zValue.zw = *Pointer<Float4>(buffer + pitch - 8); 445 } 446 else 447 { 448 zValue = *Pointer<Float4>(buffer, 16); 449 } 450 } 451 452 Int4 zTest; 453 454 switch(state.depthCompareMode) 455 { 456 case DEPTH_ALWAYS: 457 // Optimized 458 break; 459 case DEPTH_NEVER: 460 // Optimized 461 break; 462 case DEPTH_EQUAL: 463 zTest = CmpEQ(zValue, Z); 464 break; 465 case DEPTH_NOTEQUAL: 466 zTest = CmpNEQ(zValue, Z); 467 break; 468 case DEPTH_LESS: 469 if(complementaryDepthBuffer) 470 { 471 zTest = CmpLT(zValue, Z); 472 } 473 else 474 { 475 zTest = CmpNLE(zValue, Z); 476 } 477 break; 478 case DEPTH_GREATEREQUAL: 479 if(complementaryDepthBuffer) 480 { 481 zTest = CmpNLT(zValue, Z); 482 } 483 else 484 { 485 zTest = CmpLE(zValue, Z); 486 } 487 break; 488 case DEPTH_LESSEQUAL: 489 if(complementaryDepthBuffer) 490 { 491 zTest = CmpLE(zValue, Z); 492 } 493 else 494 { 495 zTest = CmpNLT(zValue, Z); 496 } 497 break; 498 case DEPTH_GREATER: 499 if(complementaryDepthBuffer) 500 { 501 zTest = CmpNLE(zValue, Z); 502 } 503 else 504 { 505 zTest = CmpLT(zValue, Z); 506 } 507 break; 508 default: 509 ASSERT(false); 510 } 511 512 switch(state.depthCompareMode) 513 { 514 case DEPTH_ALWAYS: 515 zMask = cMask; 516 break; 517 case DEPTH_NEVER: 518 zMask = 0x0; 519 break; 520 default: 521 zMask = SignMask(zTest) & cMask; 522 break; 523 } 524 525 if(state.stencilActive) 526 { 527 zMask &= sMask; 528 } 529 530 return zMask != 0; 531 } 532 alphaTest(Int & aMask,Short4 & alpha)533 void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha) 534 { 535 Short4 cmp; 536 Short4 equal; 537 538 switch(state.alphaCompareMode) 539 { 540 case ALPHA_ALWAYS: 541 aMask = 0xF; 542 break; 543 case ALPHA_NEVER: 544 aMask = 0x0; 545 break; 546 case ALPHA_EQUAL: 547 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 548 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000))); 549 break; 550 case ALPHA_NOTEQUAL: // a != b ~ !(a == b) 551 cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME 552 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000))); 553 break; 554 case ALPHA_LESS: // a < b ~ b > a 555 cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha); 556 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000))); 557 break; 558 case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate 559 equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 560 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 561 cmp |= equal; 562 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000))); 563 break; 564 case ALPHA_LESSEQUAL: // a <= b ~ !(a > b) 565 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF); // FIXME 566 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000))); 567 break; 568 case ALPHA_GREATER: // a > b 569 cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))); 570 aMask = SignMask(Pack(cmp, Short4(0x0000, 0x0000, 0x0000, 0x0000))); 571 break; 572 default: 573 ASSERT(false); 574 } 575 } 576 alphaToCoverage(Int cMask[4],Float4 & alpha)577 void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha) 578 { 579 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0))); 580 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1))); 581 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2))); 582 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3))); 583 584 Int aMask0 = SignMask(coverage0); 585 Int aMask1 = SignMask(coverage1); 586 Int aMask2 = SignMask(coverage2); 587 Int aMask3 = SignMask(coverage3); 588 589 cMask[0] &= aMask0; 590 cMask[1] &= aMask1; 591 cMask[2] &= aMask2; 592 cMask[3] &= aMask3; 593 } 594 fogBlend(Vector4f & c0,Float4 & fog)595 void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog) 596 { 597 if(!state.fogActive) 598 { 599 return; 600 } 601 602 if(state.pixelFogMode != FOG_NONE) 603 { 604 pixelFog(fog); 605 606 fog = Min(fog, Float4(1.0f)); 607 fog = Max(fog, Float4(0.0f)); 608 } 609 610 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0])); 611 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1])); 612 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2])); 613 614 c0.x *= fog; 615 c0.y *= fog; 616 c0.z *= fog; 617 618 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0])); 619 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1])); 620 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2])); 621 } 622 pixelFog(Float4 & visibility)623 void PixelRoutine::pixelFog(Float4 &visibility) 624 { 625 Float4 &zw = visibility; 626 627 if(state.pixelFogMode != FOG_NONE) 628 { 629 if(state.wBasedFog) 630 { 631 zw = rhw; 632 } 633 else 634 { 635 if(complementaryDepthBuffer) 636 { 637 zw = Float4(1.0f) - z[0]; 638 } 639 else 640 { 641 zw = z[0]; 642 } 643 } 644 } 645 646 switch(state.pixelFogMode) 647 { 648 case FOG_NONE: 649 break; 650 case FOG_LINEAR: 651 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale)); 652 zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset)); 653 break; 654 case FOG_EXP: 655 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE)); 656 zw = exponential2(zw, true); 657 break; 658 case FOG_EXP2: 659 zw *= zw; 660 zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E)); 661 zw = exponential2(zw, true); 662 break; 663 default: 664 ASSERT(false); 665 } 666 } 667 writeDepth(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & zMask)668 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask) 669 { 670 if(!state.depthWriteEnable) 671 { 672 return; 673 } 674 675 Float4 Z = z; 676 677 if(shader && shader->depthOverride()) 678 { 679 if(complementaryDepthBuffer) 680 { 681 Z = Float4(1.0f) - oDepth; 682 } 683 else 684 { 685 Z = oDepth; 686 } 687 } 688 689 Pointer<Byte> buffer; 690 Int pitch; 691 692 if(!state.quadLayoutDepthBuffer) 693 { 694 buffer = zBuffer + 4 * x; 695 pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)); 696 } 697 else 698 { 699 buffer = zBuffer + 8 * x; 700 } 701 702 if(q > 0) 703 { 704 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB)); 705 } 706 707 Float4 zValue; 708 709 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable)) 710 { 711 if(!state.quadLayoutDepthBuffer) 712 { 713 // FIXME: Properly optimizes? 714 zValue.xy = *Pointer<Float4>(buffer); 715 zValue.zw = *Pointer<Float4>(buffer + pitch - 8); 716 } 717 else 718 { 719 zValue = *Pointer<Float4>(buffer, 16); 720 } 721 } 722 723 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16)); 724 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16)); 725 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue)); 726 727 if(!state.quadLayoutDepthBuffer) 728 { 729 // FIXME: Properly optimizes? 730 *Pointer<Float2>(buffer) = Float2(Z.xy); 731 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw); 732 } 733 else 734 { 735 *Pointer<Float4>(buffer, 16) = Z; 736 } 737 } 738 writeStencil(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & zMask,Int & cMask)739 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask) 740 { 741 if(!state.stencilActive) 742 { 743 return; 744 } 745 746 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP) 747 { 748 if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP)) 749 { 750 return; 751 } 752 } 753 754 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW)) 755 { 756 return; 757 } 758 759 Pointer<Byte> buffer = sBuffer + 2 * x; 760 761 if(q > 0) 762 { 763 buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB)); 764 } 765 766 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer))); 767 768 Byte8 newValue; 769 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask); 770 771 if(!state.noStencilWriteMask) 772 { 773 Byte8 maskedValue = bufferValue; 774 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ)); 775 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ)); 776 newValue |= maskedValue; 777 } 778 779 if(state.twoSidedStencil) 780 { 781 Byte8 newValueCCW; 782 783 stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask); 784 785 if(!state.noStencilWriteMaskCCW) 786 { 787 Byte8 maskedValue = bufferValue; 788 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ)); 789 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ)); 790 newValueCCW |= maskedValue; 791 } 792 793 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)); 794 newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)); 795 newValue |= newValueCCW; 796 } 797 798 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask); 799 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask); 800 newValue |= bufferValue; 801 802 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue)); 803 } 804 stencilOperation(Byte8 & newValue,Byte8 & bufferValue,StencilOperation stencilPassOperation,StencilOperation stencilZFailOperation,StencilOperation stencilFailOperation,bool CCW,Int & zMask,Int & sMask)805 void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask) 806 { 807 Byte8 &pass = newValue; 808 Byte8 fail; 809 Byte8 zFail; 810 811 stencilOperation(pass, bufferValue, stencilPassOperation, CCW); 812 813 if(stencilZFailOperation != stencilPassOperation) 814 { 815 stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW); 816 } 817 818 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation) 819 { 820 stencilOperation(fail, bufferValue, stencilFailOperation, CCW); 821 } 822 823 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation) 824 { 825 if(state.depthTestActive && stencilZFailOperation != stencilPassOperation) // zMask valid and values not the same 826 { 827 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask); 828 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask); 829 pass |= zFail; 830 } 831 832 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask); 833 fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask); 834 pass |= fail; 835 } 836 } 837 stencilOperation(Byte8 & output,Byte8 & bufferValue,StencilOperation operation,bool CCW)838 void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW) 839 { 840 switch(operation) 841 { 842 case OPERATION_KEEP: 843 output = bufferValue; 844 break; 845 case OPERATION_ZERO: 846 output = Byte8(0x0000000000000000); 847 break; 848 case OPERATION_REPLACE: 849 output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ)); 850 break; 851 case OPERATION_INCRSAT: 852 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1)); 853 break; 854 case OPERATION_DECRSAT: 855 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1)); 856 break; 857 case OPERATION_INVERT: 858 output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF); 859 break; 860 case OPERATION_INCR: 861 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1); 862 break; 863 case OPERATION_DECR: 864 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1); 865 break; 866 default: 867 ASSERT(false); 868 } 869 } 870 blendFactor(const Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorActive)871 void PixelRoutine::blendFactor(const Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, BlendFactor blendFactorActive) 872 { 873 switch(blendFactorActive) 874 { 875 case BLEND_ZERO: 876 // Optimized 877 break; 878 case BLEND_ONE: 879 // Optimized 880 break; 881 case BLEND_SOURCE: 882 blendFactor.x = current.x; 883 blendFactor.y = current.y; 884 blendFactor.z = current.z; 885 break; 886 case BLEND_INVSOURCE: 887 blendFactor.x = Short4(0xFFFFu) - current.x; 888 blendFactor.y = Short4(0xFFFFu) - current.y; 889 blendFactor.z = Short4(0xFFFFu) - current.z; 890 break; 891 case BLEND_DEST: 892 blendFactor.x = pixel.x; 893 blendFactor.y = pixel.y; 894 blendFactor.z = pixel.z; 895 break; 896 case BLEND_INVDEST: 897 blendFactor.x = Short4(0xFFFFu) - pixel.x; 898 blendFactor.y = Short4(0xFFFFu) - pixel.y; 899 blendFactor.z = Short4(0xFFFFu) - pixel.z; 900 break; 901 case BLEND_SOURCEALPHA: 902 blendFactor.x = current.w; 903 blendFactor.y = current.w; 904 blendFactor.z = current.w; 905 break; 906 case BLEND_INVSOURCEALPHA: 907 blendFactor.x = Short4(0xFFFFu) - current.w; 908 blendFactor.y = Short4(0xFFFFu) - current.w; 909 blendFactor.z = Short4(0xFFFFu) - current.w; 910 break; 911 case BLEND_DESTALPHA: 912 blendFactor.x = pixel.w; 913 blendFactor.y = pixel.w; 914 blendFactor.z = pixel.w; 915 break; 916 case BLEND_INVDESTALPHA: 917 blendFactor.x = Short4(0xFFFFu) - pixel.w; 918 blendFactor.y = Short4(0xFFFFu) - pixel.w; 919 blendFactor.z = Short4(0xFFFFu) - pixel.w; 920 break; 921 case BLEND_SRCALPHASAT: 922 blendFactor.x = Short4(0xFFFFu) - pixel.w; 923 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w)); 924 blendFactor.y = blendFactor.x; 925 blendFactor.z = blendFactor.x; 926 break; 927 case BLEND_CONSTANT: 928 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0])); 929 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1])); 930 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2])); 931 break; 932 case BLEND_INVCONSTANT: 933 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0])); 934 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1])); 935 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2])); 936 break; 937 case BLEND_CONSTANTALPHA: 938 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 939 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 940 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 941 break; 942 case BLEND_INVCONSTANTALPHA: 943 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 944 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 945 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 946 break; 947 default: 948 ASSERT(false); 949 } 950 } 951 blendFactorAlpha(const Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorAlphaActive)952 void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, BlendFactor blendFactorAlphaActive) 953 { 954 switch(blendFactorAlphaActive) 955 { 956 case BLEND_ZERO: 957 // Optimized 958 break; 959 case BLEND_ONE: 960 // Optimized 961 break; 962 case BLEND_SOURCE: 963 blendFactor.w = current.w; 964 break; 965 case BLEND_INVSOURCE: 966 blendFactor.w = Short4(0xFFFFu) - current.w; 967 break; 968 case BLEND_DEST: 969 blendFactor.w = pixel.w; 970 break; 971 case BLEND_INVDEST: 972 blendFactor.w = Short4(0xFFFFu) - pixel.w; 973 break; 974 case BLEND_SOURCEALPHA: 975 blendFactor.w = current.w; 976 break; 977 case BLEND_INVSOURCEALPHA: 978 blendFactor.w = Short4(0xFFFFu) - current.w; 979 break; 980 case BLEND_DESTALPHA: 981 blendFactor.w = pixel.w; 982 break; 983 case BLEND_INVDESTALPHA: 984 blendFactor.w = Short4(0xFFFFu) - pixel.w; 985 break; 986 case BLEND_SRCALPHASAT: 987 blendFactor.w = Short4(0xFFFFu); 988 break; 989 case BLEND_CONSTANT: 990 case BLEND_CONSTANTALPHA: 991 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3])); 992 break; 993 case BLEND_INVCONSTANT: 994 case BLEND_INVCONSTANTALPHA: 995 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3])); 996 break; 997 default: 998 ASSERT(false); 999 } 1000 } 1001 isSRGB(int index) const1002 bool PixelRoutine::isSRGB(int index) const 1003 { 1004 return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8; 1005 } 1006 readPixel(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & pixel)1007 void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel) 1008 { 1009 Short4 c01; 1010 Short4 c23; 1011 Pointer<Byte> buffer; 1012 Pointer<Byte> buffer2; 1013 1014 switch(state.targetFormat[index]) 1015 { 1016 case FORMAT_R5G6B5: 1017 buffer = cBuffer + 2 * x; 1018 buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1019 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2))); 1020 1021 pixel.x = c01 & Short4(0xF800u); 1022 pixel.y = (c01 & Short4(0x07E0u)) << 5; 1023 pixel.z = (c01 & Short4(0x001Fu)) << 11; 1024 pixel.w = Short4(0xFFFFu); 1025 break; 1026 case FORMAT_A8R8G8B8: 1027 buffer = cBuffer + 4 * x; 1028 c01 = *Pointer<Short4>(buffer); 1029 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1030 c23 = *Pointer<Short4>(buffer); 1031 pixel.z = c01; 1032 pixel.y = c01; 1033 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); 1034 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); 1035 pixel.x = pixel.z; 1036 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); 1037 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); 1038 pixel.y = pixel.z; 1039 pixel.w = pixel.x; 1040 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x)); 1041 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); 1042 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); 1043 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 1044 break; 1045 case FORMAT_A8B8G8R8: 1046 case FORMAT_SRGB8_A8: 1047 buffer = cBuffer + 4 * x; 1048 c01 = *Pointer<Short4>(buffer); 1049 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1050 c23 = *Pointer<Short4>(buffer); 1051 pixel.z = c01; 1052 pixel.y = c01; 1053 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); 1054 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); 1055 pixel.x = pixel.z; 1056 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); 1057 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); 1058 pixel.y = pixel.z; 1059 pixel.w = pixel.x; 1060 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); 1061 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); 1062 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 1063 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 1064 break; 1065 case FORMAT_A8: 1066 buffer = cBuffer + 1 * x; 1067 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0); 1068 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1069 pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1); 1070 pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 1071 pixel.x = Short4(0x0000); 1072 pixel.y = Short4(0x0000); 1073 pixel.z = Short4(0x0000); 1074 break; 1075 case FORMAT_X8R8G8B8: 1076 buffer = cBuffer + 4 * x; 1077 c01 = *Pointer<Short4>(buffer); 1078 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1079 c23 = *Pointer<Short4>(buffer); 1080 pixel.z = c01; 1081 pixel.y = c01; 1082 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); 1083 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); 1084 pixel.x = pixel.z; 1085 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); 1086 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); 1087 pixel.y = pixel.z; 1088 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x)); 1089 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); 1090 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); 1091 pixel.w = Short4(0xFFFFu); 1092 break; 1093 case FORMAT_X8B8G8R8: 1094 case FORMAT_SRGB8_X8: 1095 buffer = cBuffer + 4 * x; 1096 c01 = *Pointer<Short4>(buffer); 1097 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1098 c23 = *Pointer<Short4>(buffer); 1099 pixel.z = c01; 1100 pixel.y = c01; 1101 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23)); 1102 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23)); 1103 pixel.x = pixel.z; 1104 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y)); 1105 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y)); 1106 pixel.y = pixel.z; 1107 pixel.w = pixel.x; 1108 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z)); 1109 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y)); 1110 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w)); 1111 pixel.w = Short4(0xFFFFu); 1112 break; 1113 case FORMAT_A8G8R8B8Q: 1114 UNIMPLEMENTED(); 1115 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0)); 1116 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0)); 1117 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8)); 1118 // pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8)); 1119 break; 1120 case FORMAT_X8G8R8B8Q: 1121 UNIMPLEMENTED(); 1122 // pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0)); 1123 // pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0)); 1124 // pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8)); 1125 // pixel.w = Short4(0xFFFFu); 1126 break; 1127 case FORMAT_A16B16G16R16: 1128 buffer = cBuffer; 1129 pixel.x = *Pointer<Short4>(buffer + 8 * x); 1130 pixel.y = *Pointer<Short4>(buffer + 8 * x + 8); 1131 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1132 pixel.z = *Pointer<Short4>(buffer + 8 * x); 1133 pixel.w = *Pointer<Short4>(buffer + 8 * x + 8); 1134 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w); 1135 break; 1136 case FORMAT_G16R16: 1137 buffer = cBuffer; 1138 pixel.x = *Pointer<Short4>(buffer + 4 * x); 1139 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index])); 1140 pixel.y = *Pointer<Short4>(buffer + 4 * x); 1141 pixel.z = pixel.x; 1142 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y)); 1143 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y)); 1144 pixel.y = pixel.z; 1145 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z)); 1146 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z)); 1147 pixel.z = Short4(0xFFFFu); 1148 pixel.w = Short4(0xFFFFu); 1149 break; 1150 default: 1151 ASSERT(false); 1152 } 1153 1154 if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) 1155 { 1156 sRGBtoLinear16_12_16(pixel); 1157 } 1158 } 1159 alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1160 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s ¤t, Int &x) 1161 { 1162 if(!state.alphaBlendActive) 1163 { 1164 return; 1165 } 1166 1167 Vector4s pixel; 1168 readPixel(index, cBuffer, x, pixel); 1169 1170 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor 1171 Vector4s sourceFactor; 1172 Vector4s destFactor; 1173 1174 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor); 1175 blendFactor(destFactor, current, pixel, state.destBlendFactor); 1176 1177 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO) 1178 { 1179 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x)); 1180 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y)); 1181 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z)); 1182 } 1183 1184 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO) 1185 { 1186 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x)); 1187 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y)); 1188 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z)); 1189 } 1190 1191 switch(state.blendOperation) 1192 { 1193 case BLENDOP_ADD: 1194 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1195 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1196 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1197 break; 1198 case BLENDOP_SUB: 1199 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1200 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1201 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1202 break; 1203 case BLENDOP_INVSUB: 1204 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x)); 1205 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y)); 1206 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z)); 1207 break; 1208 case BLENDOP_MIN: 1209 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1210 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1211 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1212 break; 1213 case BLENDOP_MAX: 1214 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x)); 1215 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y)); 1216 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z)); 1217 break; 1218 case BLENDOP_SOURCE: 1219 // No operation 1220 break; 1221 case BLENDOP_DEST: 1222 current.x = pixel.x; 1223 current.y = pixel.y; 1224 current.z = pixel.z; 1225 break; 1226 case BLENDOP_NULL: 1227 current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000); 1228 current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000); 1229 current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000); 1230 break; 1231 default: 1232 ASSERT(false); 1233 } 1234 1235 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha); 1236 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha); 1237 1238 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO) 1239 { 1240 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w)); 1241 } 1242 1243 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO) 1244 { 1245 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w)); 1246 } 1247 1248 switch(state.blendOperationAlpha) 1249 { 1250 case BLENDOP_ADD: 1251 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1252 break; 1253 case BLENDOP_SUB: 1254 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1255 break; 1256 case BLENDOP_INVSUB: 1257 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w)); 1258 break; 1259 case BLENDOP_MIN: 1260 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1261 break; 1262 case BLENDOP_MAX: 1263 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w)); 1264 break; 1265 case BLENDOP_SOURCE: 1266 // No operation 1267 break; 1268 case BLENDOP_DEST: 1269 current.w = pixel.w; 1270 break; 1271 case BLENDOP_NULL: 1272 current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000); 1273 break; 1274 default: 1275 ASSERT(false); 1276 } 1277 } 1278 logicOperation(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1279 void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s ¤t, Int &x) 1280 { 1281 if(state.logicalOperation == LOGICALOP_COPY) 1282 { 1283 return; 1284 } 1285 1286 Vector4s pixel; 1287 readPixel(index, cBuffer, x, pixel); 1288 1289 switch(state.logicalOperation) 1290 { 1291 case LOGICALOP_CLEAR: 1292 current.x = 0; 1293 current.y = 0; 1294 current.z = 0; 1295 break; 1296 case LOGICALOP_SET: 1297 current.x = 0xFFFFu; 1298 current.y = 0xFFFFu; 1299 current.z = 0xFFFFu; 1300 break; 1301 case LOGICALOP_COPY: 1302 ASSERT(false); // Optimized out 1303 break; 1304 case LOGICALOP_COPY_INVERTED: 1305 current.x = ~current.x; 1306 current.y = ~current.y; 1307 current.z = ~current.z; 1308 break; 1309 case LOGICALOP_NOOP: 1310 current.x = pixel.x; 1311 current.y = pixel.y; 1312 current.z = pixel.z; 1313 break; 1314 case LOGICALOP_INVERT: 1315 current.x = ~pixel.x; 1316 current.y = ~pixel.y; 1317 current.z = ~pixel.z; 1318 break; 1319 case LOGICALOP_AND: 1320 current.x = pixel.x & current.x; 1321 current.y = pixel.y & current.y; 1322 current.z = pixel.z & current.z; 1323 break; 1324 case LOGICALOP_NAND: 1325 current.x = ~(pixel.x & current.x); 1326 current.y = ~(pixel.y & current.y); 1327 current.z = ~(pixel.z & current.z); 1328 break; 1329 case LOGICALOP_OR: 1330 current.x = pixel.x | current.x; 1331 current.y = pixel.y | current.y; 1332 current.z = pixel.z | current.z; 1333 break; 1334 case LOGICALOP_NOR: 1335 current.x = ~(pixel.x | current.x); 1336 current.y = ~(pixel.y | current.y); 1337 current.z = ~(pixel.z | current.z); 1338 break; 1339 case LOGICALOP_XOR: 1340 current.x = pixel.x ^ current.x; 1341 current.y = pixel.y ^ current.y; 1342 current.z = pixel.z ^ current.z; 1343 break; 1344 case LOGICALOP_EQUIV: 1345 current.x = ~(pixel.x ^ current.x); 1346 current.y = ~(pixel.y ^ current.y); 1347 current.z = ~(pixel.z ^ current.z); 1348 break; 1349 case LOGICALOP_AND_REVERSE: 1350 current.x = ~pixel.x & current.x; 1351 current.y = ~pixel.y & current.y; 1352 current.z = ~pixel.z & current.z; 1353 break; 1354 case LOGICALOP_AND_INVERTED: 1355 current.x = pixel.x & ~current.x; 1356 current.y = pixel.y & ~current.y; 1357 current.z = pixel.z & ~current.z; 1358 break; 1359 case LOGICALOP_OR_REVERSE: 1360 current.x = ~pixel.x | current.x; 1361 current.y = ~pixel.y | current.y; 1362 current.z = ~pixel.z | current.z; 1363 break; 1364 case LOGICALOP_OR_INVERTED: 1365 current.x = pixel.x | ~current.x; 1366 current.y = pixel.y | ~current.y; 1367 current.z = pixel.z | ~current.z; 1368 break; 1369 default: 1370 ASSERT(false); 1371 } 1372 } 1373 writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & current,Int & sMask,Int & zMask,Int & cMask)1374 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s ¤t, Int &sMask, Int &zMask, Int &cMask) 1375 { 1376 if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) 1377 { 1378 linearToSRGB16_12_16(current); 1379 } 1380 1381 if(exactColorRounding) 1382 { 1383 switch(state.targetFormat[index]) 1384 { 1385 case FORMAT_R5G6B5: 1386 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400)); 1387 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200)); 1388 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400)); 1389 break; 1390 case FORMAT_X8G8R8B8Q: 1391 case FORMAT_A8G8R8B8Q: 1392 case FORMAT_X8R8G8B8: 1393 case FORMAT_X8B8G8R8: 1394 case FORMAT_A8R8G8B8: 1395 case FORMAT_A8B8G8R8: 1396 case FORMAT_SRGB8_X8: 1397 case FORMAT_SRGB8_A8: 1398 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080); 1399 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080); 1400 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080); 1401 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080); 1402 break; 1403 default: 1404 break; 1405 } 1406 } 1407 1408 int rgbaWriteMask = state.colorWriteActive(index); 1409 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2; 1410 int brgaWriteMask = (rgbaWriteMask & 0x00000008) | (rgbaWriteMask & 0x00000001) << 1 | (rgbaWriteMask & 0x00000002) << 1 | (rgbaWriteMask & 0x00000004) >> 2; 1411 1412 switch(state.targetFormat[index]) 1413 { 1414 case FORMAT_R5G6B5: 1415 { 1416 current.x = current.x & Short4(0xF800u); 1417 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5; 1418 current.z = As<UShort4>(current.z) >> 11; 1419 1420 current.x = current.x | current.y | current.z; 1421 } 1422 break; 1423 case FORMAT_X8G8R8B8Q: 1424 UNIMPLEMENTED(); 1425 // current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1426 // current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1427 // current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1428 1429 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x))); 1430 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y))); 1431 break; 1432 case FORMAT_A8G8R8B8Q: 1433 UNIMPLEMENTED(); 1434 // current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1435 // current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1436 // current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1437 // current.w = As<Short4>(As<UShort4>(current.w) >> 8); 1438 1439 // current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x))); 1440 // current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w))); 1441 break; 1442 case FORMAT_X8R8G8B8: 1443 case FORMAT_A8R8G8B8: 1444 if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7) 1445 { 1446 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1447 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1448 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1449 1450 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x))); 1451 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y))); 1452 1453 current.x = current.z; 1454 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1455 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1456 current.y = current.z; 1457 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1458 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1459 } 1460 else 1461 { 1462 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1463 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1464 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1465 current.w = As<Short4>(As<UShort4>(current.w) >> 8); 1466 1467 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x))); 1468 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w))); 1469 1470 current.x = current.z; 1471 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1472 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1473 current.y = current.z; 1474 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1475 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1476 } 1477 break; 1478 case FORMAT_X8B8G8R8: 1479 case FORMAT_A8B8G8R8: 1480 case FORMAT_SRGB8_X8: 1481 case FORMAT_SRGB8_A8: 1482 if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7) 1483 { 1484 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1485 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1486 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1487 1488 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z))); 1489 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y))); 1490 1491 current.x = current.z; 1492 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1493 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1494 current.y = current.z; 1495 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1496 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1497 } 1498 else 1499 { 1500 current.x = As<Short4>(As<UShort4>(current.x) >> 8); 1501 current.y = As<Short4>(As<UShort4>(current.y) >> 8); 1502 current.z = As<Short4>(As<UShort4>(current.z) >> 8); 1503 current.w = As<Short4>(As<UShort4>(current.w) >> 8); 1504 1505 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z))); 1506 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w))); 1507 1508 current.x = current.z; 1509 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y)); 1510 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y)); 1511 current.y = current.z; 1512 current.z = As<Short4>(UnpackLow(current.z, current.x)); 1513 current.y = As<Short4>(UnpackHigh(current.y, current.x)); 1514 } 1515 break; 1516 case FORMAT_A8: 1517 current.w = As<Short4>(As<UShort4>(current.w) >> 8); 1518 current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w))); 1519 break; 1520 case FORMAT_G16R16: 1521 current.z = current.x; 1522 current.x = As<Short4>(UnpackLow(current.x, current.y)); 1523 current.z = As<Short4>(UnpackHigh(current.z, current.y)); 1524 current.y = current.z; 1525 break; 1526 case FORMAT_A16B16G16R16: 1527 transpose4x4(current.x, current.y, current.z, current.w); 1528 break; 1529 default: 1530 ASSERT(false); 1531 } 1532 1533 Short4 c01 = current.z; 1534 Short4 c23 = current.y; 1535 1536 Int xMask; // Combination of all masks 1537 1538 if(state.depthTestActive) 1539 { 1540 xMask = zMask; 1541 } 1542 else 1543 { 1544 xMask = cMask; 1545 } 1546 1547 if(state.stencilActive) 1548 { 1549 xMask &= sMask; 1550 } 1551 1552 switch(state.targetFormat[index]) 1553 { 1554 case FORMAT_R5G6B5: 1555 { 1556 Pointer<Byte> buffer = cBuffer + 2 * x; 1557 Int value = *Pointer<Int>(buffer); 1558 1559 Int c01 = Extract(As<Int2>(current.x), 0); 1560 1561 if((bgraWriteMask & 0x00000007) != 0x00000007) 1562 { 1563 Int masked = value; 1564 c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0])); 1565 masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0])); 1566 c01 |= masked; 1567 } 1568 1569 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8); 1570 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8); 1571 c01 |= value; 1572 *Pointer<Int>(buffer) = c01; 1573 1574 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1575 value = *Pointer<Int>(buffer); 1576 1577 Int c23 = Extract(As<Int2>(current.x), 1); 1578 1579 if((bgraWriteMask & 0x00000007) != 0x00000007) 1580 { 1581 Int masked = value; 1582 c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0])); 1583 masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0])); 1584 c23 |= masked; 1585 } 1586 1587 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8); 1588 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8); 1589 c23 |= value; 1590 *Pointer<Int>(buffer) = c23; 1591 } 1592 break; 1593 case FORMAT_A8G8R8B8Q: 1594 case FORMAT_X8G8R8B8Q: // FIXME: Don't touch alpha? 1595 UNIMPLEMENTED(); 1596 // value = *Pointer<Short4>(cBuffer + 8 * x + 0); 1597 1598 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) || 1599 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) && 1600 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? 1601 // { 1602 // Short4 masked = value; 1603 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); 1604 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); 1605 // c01 |= masked; 1606 // } 1607 1608 // c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); 1609 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); 1610 // c01 |= value; 1611 // *Pointer<Short4>(cBuffer + 8 * x + 0) = c01; 1612 1613 // value = *Pointer<Short4>(cBuffer + 8 * x + 8); 1614 1615 // if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) || 1616 // ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) && 1617 // (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? 1618 // { 1619 // Short4 masked = value; 1620 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); 1621 // masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); 1622 // c23 |= masked; 1623 // } 1624 1625 // c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); 1626 // value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); 1627 // c23 |= value; 1628 // *Pointer<Short4>(cBuffer + 8 * x + 8) = c23; 1629 break; 1630 case FORMAT_A8R8G8B8: 1631 case FORMAT_X8R8G8B8: // FIXME: Don't touch alpha? 1632 { 1633 Pointer<Byte> buffer = cBuffer + x * 4; 1634 Short4 value = *Pointer<Short4>(buffer); 1635 1636 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) || 1637 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) && 1638 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? 1639 { 1640 Short4 masked = value; 1641 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); 1642 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); 1643 c01 |= masked; 1644 } 1645 1646 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); 1647 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); 1648 c01 |= value; 1649 *Pointer<Short4>(buffer) = c01; 1650 1651 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1652 value = *Pointer<Short4>(buffer); 1653 1654 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) || 1655 ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) && 1656 (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F))) // FIXME: Need for masking when XRGB && Fh? 1657 { 1658 Short4 masked = value; 1659 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0])); 1660 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0])); 1661 c23 |= masked; 1662 } 1663 1664 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); 1665 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); 1666 c23 |= value; 1667 *Pointer<Short4>(buffer) = c23; 1668 } 1669 break; 1670 case FORMAT_A8B8G8R8: 1671 case FORMAT_X8B8G8R8: // FIXME: Don't touch alpha? 1672 case FORMAT_SRGB8_X8: 1673 case FORMAT_SRGB8_A8: 1674 { 1675 Pointer<Byte> buffer = cBuffer + x * 4; 1676 Short4 value = *Pointer<Short4>(buffer); 1677 1678 bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) || 1679 (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) && 1680 ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh? 1681 1682 if(masked) 1683 { 1684 Short4 masked = value; 1685 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0])); 1686 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0])); 1687 c01 |= masked; 1688 } 1689 1690 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); 1691 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); 1692 c01 |= value; 1693 *Pointer<Short4>(buffer) = c01; 1694 1695 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1696 value = *Pointer<Short4>(buffer); 1697 1698 if(masked) 1699 { 1700 Short4 masked = value; 1701 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0])); 1702 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0])); 1703 c23 |= masked; 1704 } 1705 1706 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); 1707 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); 1708 c23 |= value; 1709 *Pointer<Short4>(buffer) = c23; 1710 } 1711 break; 1712 case FORMAT_A8: 1713 if(rgbaWriteMask & 0x00000008) 1714 { 1715 Pointer<Byte> buffer = cBuffer + 1 * x; 1716 Short4 value; 1717 Insert(value, *Pointer<Short>(buffer), 0); 1718 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1719 Insert(value, *Pointer<Short>(buffer + pitch), 1); 1720 value = UnpackLow(As<Byte8>(value), As<Byte8>(value)); 1721 1722 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask); 1723 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask); 1724 current.w |= value; 1725 1726 *Pointer<Short>(buffer) = Extract(current.w, 0); 1727 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1); 1728 } 1729 break; 1730 case FORMAT_G16R16: 1731 { 1732 Pointer<Byte> buffer = cBuffer + 4 * x; 1733 1734 Short4 value = *Pointer<Short4>(buffer); 1735 1736 if((rgbaWriteMask & 0x00000003) != 0x00000003) 1737 { 1738 Short4 masked = value; 1739 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0])); 1740 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0])); 1741 current.x |= masked; 1742 } 1743 1744 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8); 1745 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8); 1746 current.x |= value; 1747 *Pointer<Short4>(buffer) = current.x; 1748 1749 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1750 1751 value = *Pointer<Short4>(buffer); 1752 1753 if((rgbaWriteMask & 0x00000003) != 0x00000003) 1754 { 1755 Short4 masked = value; 1756 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0])); 1757 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0])); 1758 current.y |= masked; 1759 } 1760 1761 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8); 1762 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8); 1763 current.y |= value; 1764 *Pointer<Short4>(buffer) = current.y; 1765 } 1766 break; 1767 case FORMAT_A16B16G16R16: 1768 { 1769 Pointer<Byte> buffer = cBuffer + 8 * x; 1770 1771 { 1772 Short4 value = *Pointer<Short4>(buffer); 1773 1774 if(rgbaWriteMask != 0x0000000F) 1775 { 1776 Short4 masked = value; 1777 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1778 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1779 current.x |= masked; 1780 } 1781 1782 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8); 1783 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8); 1784 current.x |= value; 1785 *Pointer<Short4>(buffer) = current.x; 1786 } 1787 1788 { 1789 Short4 value = *Pointer<Short4>(buffer + 8); 1790 1791 if(rgbaWriteMask != 0x0000000F) 1792 { 1793 Short4 masked = value; 1794 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1795 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1796 current.y |= masked; 1797 } 1798 1799 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8); 1800 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8); 1801 current.y |= value; 1802 *Pointer<Short4>(buffer + 8) = current.y; 1803 } 1804 1805 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 1806 1807 { 1808 Short4 value = *Pointer<Short4>(buffer); 1809 1810 if(rgbaWriteMask != 0x0000000F) 1811 { 1812 Short4 masked = value; 1813 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1814 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1815 current.z |= masked; 1816 } 1817 1818 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8); 1819 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8); 1820 current.z |= value; 1821 *Pointer<Short4>(buffer) = current.z; 1822 } 1823 1824 { 1825 Short4 value = *Pointer<Short4>(buffer + 8); 1826 1827 if(rgbaWriteMask != 0x0000000F) 1828 { 1829 Short4 masked = value; 1830 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0])); 1831 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0])); 1832 current.w |= masked; 1833 } 1834 1835 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8); 1836 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8); 1837 current.w |= value; 1838 *Pointer<Short4>(buffer + 8) = current.w; 1839 } 1840 } 1841 break; 1842 default: 1843 ASSERT(false); 1844 } 1845 } 1846 blendFactor(const Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorActive)1847 void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive) 1848 { 1849 switch(blendFactorActive) 1850 { 1851 case BLEND_ZERO: 1852 // Optimized 1853 break; 1854 case BLEND_ONE: 1855 // Optimized 1856 break; 1857 case BLEND_SOURCE: 1858 blendFactor.x = oC.x; 1859 blendFactor.y = oC.y; 1860 blendFactor.z = oC.z; 1861 break; 1862 case BLEND_INVSOURCE: 1863 blendFactor.x = Float4(1.0f) - oC.x; 1864 blendFactor.y = Float4(1.0f) - oC.y; 1865 blendFactor.z = Float4(1.0f) - oC.z; 1866 break; 1867 case BLEND_DEST: 1868 blendFactor.x = pixel.x; 1869 blendFactor.y = pixel.y; 1870 blendFactor.z = pixel.z; 1871 break; 1872 case BLEND_INVDEST: 1873 blendFactor.x = Float4(1.0f) - pixel.x; 1874 blendFactor.y = Float4(1.0f) - pixel.y; 1875 blendFactor.z = Float4(1.0f) - pixel.z; 1876 break; 1877 case BLEND_SOURCEALPHA: 1878 blendFactor.x = oC.w; 1879 blendFactor.y = oC.w; 1880 blendFactor.z = oC.w; 1881 break; 1882 case BLEND_INVSOURCEALPHA: 1883 blendFactor.x = Float4(1.0f) - oC.w; 1884 blendFactor.y = Float4(1.0f) - oC.w; 1885 blendFactor.z = Float4(1.0f) - oC.w; 1886 break; 1887 case BLEND_DESTALPHA: 1888 blendFactor.x = pixel.w; 1889 blendFactor.y = pixel.w; 1890 blendFactor.z = pixel.w; 1891 break; 1892 case BLEND_INVDESTALPHA: 1893 blendFactor.x = Float4(1.0f) - pixel.w; 1894 blendFactor.y = Float4(1.0f) - pixel.w; 1895 blendFactor.z = Float4(1.0f) - pixel.w; 1896 break; 1897 case BLEND_SRCALPHASAT: 1898 blendFactor.x = Float4(1.0f) - pixel.w; 1899 blendFactor.x = Min(blendFactor.x, oC.w); 1900 blendFactor.y = blendFactor.x; 1901 blendFactor.z = blendFactor.x; 1902 break; 1903 case BLEND_CONSTANT: 1904 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0])); 1905 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1])); 1906 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2])); 1907 break; 1908 case BLEND_INVCONSTANT: 1909 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0])); 1910 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1])); 1911 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2])); 1912 break; 1913 default: 1914 ASSERT(false); 1915 } 1916 } 1917 blendFactorAlpha(const Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorAlphaActive)1918 void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive) 1919 { 1920 switch(blendFactorAlphaActive) 1921 { 1922 case BLEND_ZERO: 1923 // Optimized 1924 break; 1925 case BLEND_ONE: 1926 // Optimized 1927 break; 1928 case BLEND_SOURCE: 1929 blendFactor.w = oC.w; 1930 break; 1931 case BLEND_INVSOURCE: 1932 blendFactor.w = Float4(1.0f) - oC.w; 1933 break; 1934 case BLEND_DEST: 1935 blendFactor.w = pixel.w; 1936 break; 1937 case BLEND_INVDEST: 1938 blendFactor.w = Float4(1.0f) - pixel.w; 1939 break; 1940 case BLEND_SOURCEALPHA: 1941 blendFactor.w = oC.w; 1942 break; 1943 case BLEND_INVSOURCEALPHA: 1944 blendFactor.w = Float4(1.0f) - oC.w; 1945 break; 1946 case BLEND_DESTALPHA: 1947 blendFactor.w = pixel.w; 1948 break; 1949 case BLEND_INVDESTALPHA: 1950 blendFactor.w = Float4(1.0f) - pixel.w; 1951 break; 1952 case BLEND_SRCALPHASAT: 1953 blendFactor.w = Float4(1.0f); 1954 break; 1955 case BLEND_CONSTANT: 1956 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3])); 1957 break; 1958 case BLEND_INVCONSTANT: 1959 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3])); 1960 break; 1961 default: 1962 ASSERT(false); 1963 } 1964 } 1965 alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4f & oC,Int & x)1966 void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x) 1967 { 1968 if(!state.alphaBlendActive) 1969 { 1970 return; 1971 } 1972 1973 Pointer<Byte> buffer; 1974 Vector4f pixel; 1975 1976 Vector4s color; 1977 Short4 c01; 1978 Short4 c23; 1979 1980 Float4 one; 1981 switch(state.targetFormat[index]) 1982 { 1983 case FORMAT_R32I: 1984 case FORMAT_G32R32I: 1985 one = As<Float4>(Int4(0x7FFFFFFF)); 1986 break; 1987 case FORMAT_R32UI: 1988 case FORMAT_G32R32UI: 1989 one = As<Float4>(Int4(0xFFFFFFFF)); 1990 break; 1991 case FORMAT_R32F: 1992 case FORMAT_G32R32F: 1993 one = Float4(1.0f); 1994 break; 1995 } 1996 1997 switch(state.targetFormat[index]) 1998 { 1999 case FORMAT_R32I: 2000 case FORMAT_R32UI: 2001 case FORMAT_R32F: 2002 buffer = cBuffer; 2003 // FIXME: movlps 2004 pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0); 2005 pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4); 2006 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2007 // FIXME: movhps 2008 pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0); 2009 pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4); 2010 pixel.y = pixel.z = pixel.w = one; 2011 break; 2012 case FORMAT_G32R32I: 2013 case FORMAT_G32R32UI: 2014 case FORMAT_G32R32F: 2015 buffer = cBuffer; 2016 pixel.x = *Pointer<Float4>(buffer + 8 * x, 16); 2017 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2018 pixel.y = *Pointer<Float4>(buffer + 8 * x, 16); 2019 pixel.z = pixel.x; 2020 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88); 2021 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD); 2022 pixel.y = pixel.z; 2023 pixel.z = pixel.w = one; 2024 break; 2025 case FORMAT_X32B32G32R32F: 2026 case FORMAT_A32B32G32R32F: 2027 case FORMAT_A32B32G32R32I: 2028 case FORMAT_A32B32G32R32UI: 2029 buffer = cBuffer; 2030 pixel.x = *Pointer<Float4>(buffer + 16 * x, 16); 2031 pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16); 2032 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2033 pixel.z = *Pointer<Float4>(buffer + 16 * x, 16); 2034 pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16); 2035 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w); 2036 if(state.targetFormat[index] == FORMAT_X32B32G32R32F) 2037 { 2038 pixel.w = Float4(1.0f); 2039 } 2040 break; 2041 default: 2042 ASSERT(false); 2043 } 2044 2045 if((postBlendSRGB && state.writeSRGB) || isSRGB(index)) 2046 { 2047 sRGBtoLinear(pixel.x); 2048 sRGBtoLinear(pixel.y); 2049 sRGBtoLinear(pixel.z); 2050 } 2051 2052 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor 2053 Vector4f sourceFactor; 2054 Vector4f destFactor; 2055 2056 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor); 2057 blendFactor(destFactor, oC, pixel, state.destBlendFactor); 2058 2059 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO) 2060 { 2061 oC.x *= sourceFactor.x; 2062 oC.y *= sourceFactor.y; 2063 oC.z *= sourceFactor.z; 2064 } 2065 2066 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO) 2067 { 2068 pixel.x *= destFactor.x; 2069 pixel.y *= destFactor.y; 2070 pixel.z *= destFactor.z; 2071 } 2072 2073 switch(state.blendOperation) 2074 { 2075 case BLENDOP_ADD: 2076 oC.x += pixel.x; 2077 oC.y += pixel.y; 2078 oC.z += pixel.z; 2079 break; 2080 case BLENDOP_SUB: 2081 oC.x -= pixel.x; 2082 oC.y -= pixel.y; 2083 oC.z -= pixel.z; 2084 break; 2085 case BLENDOP_INVSUB: 2086 oC.x = pixel.x - oC.x; 2087 oC.y = pixel.y - oC.y; 2088 oC.z = pixel.z - oC.z; 2089 break; 2090 case BLENDOP_MIN: 2091 oC.x = Min(oC.x, pixel.x); 2092 oC.y = Min(oC.y, pixel.y); 2093 oC.z = Min(oC.z, pixel.z); 2094 break; 2095 case BLENDOP_MAX: 2096 oC.x = Max(oC.x, pixel.x); 2097 oC.y = Max(oC.y, pixel.y); 2098 oC.z = Max(oC.z, pixel.z); 2099 break; 2100 case BLENDOP_SOURCE: 2101 // No operation 2102 break; 2103 case BLENDOP_DEST: 2104 oC.x = pixel.x; 2105 oC.y = pixel.y; 2106 oC.z = pixel.z; 2107 break; 2108 case BLENDOP_NULL: 2109 oC.x = Float4(0.0f); 2110 oC.y = Float4(0.0f); 2111 oC.z = Float4(0.0f); 2112 break; 2113 default: 2114 ASSERT(false); 2115 } 2116 2117 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha); 2118 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha); 2119 2120 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO) 2121 { 2122 oC.w *= sourceFactor.w; 2123 } 2124 2125 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO) 2126 { 2127 pixel.w *= destFactor.w; 2128 } 2129 2130 switch(state.blendOperationAlpha) 2131 { 2132 case BLENDOP_ADD: 2133 oC.w += pixel.w; 2134 break; 2135 case BLENDOP_SUB: 2136 oC.w -= pixel.w; 2137 break; 2138 case BLENDOP_INVSUB: 2139 pixel.w -= oC.w; 2140 oC.w = pixel.w; 2141 break; 2142 case BLENDOP_MIN: 2143 oC.w = Min(oC.w, pixel.w); 2144 break; 2145 case BLENDOP_MAX: 2146 oC.w = Max(oC.w, pixel.w); 2147 break; 2148 case BLENDOP_SOURCE: 2149 // No operation 2150 break; 2151 case BLENDOP_DEST: 2152 oC.w = pixel.w; 2153 break; 2154 case BLENDOP_NULL: 2155 oC.w = Float4(0.0f); 2156 break; 2157 default: 2158 ASSERT(false); 2159 } 2160 } 2161 writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4f & oC,Int & sMask,Int & zMask,Int & cMask)2162 void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask) 2163 { 2164 switch(state.targetFormat[index]) 2165 { 2166 case FORMAT_R32F: 2167 case FORMAT_R32I: 2168 case FORMAT_R32UI: 2169 break; 2170 case FORMAT_G32R32F: 2171 case FORMAT_G32R32I: 2172 case FORMAT_G32R32UI: 2173 oC.z = oC.x; 2174 oC.x = UnpackLow(oC.x, oC.y); 2175 oC.z = UnpackHigh(oC.z, oC.y); 2176 oC.y = oC.z; 2177 break; 2178 case FORMAT_X32B32G32R32F: 2179 case FORMAT_A32B32G32R32F: 2180 case FORMAT_A32B32G32R32I: 2181 case FORMAT_A32B32G32R32UI: 2182 transpose4x4(oC.x, oC.y, oC.z, oC.w); 2183 break; 2184 default: 2185 ASSERT(false); 2186 } 2187 2188 int rgbaWriteMask = state.colorWriteActive(index); 2189 2190 Int xMask; // Combination of all masks 2191 2192 if(state.depthTestActive) 2193 { 2194 xMask = zMask; 2195 } 2196 else 2197 { 2198 xMask = cMask; 2199 } 2200 2201 if(state.stencilActive) 2202 { 2203 xMask &= sMask; 2204 } 2205 2206 Pointer<Byte> buffer; 2207 Float4 value; 2208 2209 switch(state.targetFormat[index]) 2210 { 2211 case FORMAT_R32F: 2212 case FORMAT_R32I: 2213 case FORMAT_R32UI: 2214 if(rgbaWriteMask & 0x00000001) 2215 { 2216 buffer = cBuffer + 4 * x; 2217 2218 // FIXME: movlps 2219 value.x = *Pointer<Float>(buffer + 0); 2220 value.y = *Pointer<Float>(buffer + 4); 2221 2222 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2223 2224 // FIXME: movhps 2225 value.z = *Pointer<Float>(buffer + 0); 2226 value.w = *Pointer<Float>(buffer + 4); 2227 2228 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16)); 2229 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16)); 2230 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); 2231 2232 // FIXME: movhps 2233 *Pointer<Float>(buffer + 0) = oC.x.z; 2234 *Pointer<Float>(buffer + 4) = oC.x.w; 2235 2236 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2237 2238 // FIXME: movlps 2239 *Pointer<Float>(buffer + 0) = oC.x.x; 2240 *Pointer<Float>(buffer + 4) = oC.x.y; 2241 } 2242 break; 2243 case FORMAT_G32R32F: 2244 case FORMAT_G32R32I: 2245 case FORMAT_G32R32UI: 2246 buffer = cBuffer + 8 * x; 2247 2248 value = *Pointer<Float4>(buffer); 2249 2250 if((rgbaWriteMask & 0x00000003) != 0x00000003) 2251 { 2252 Float4 masked = value; 2253 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0]))); 2254 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0]))); 2255 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked)); 2256 } 2257 2258 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16)); 2259 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16)); 2260 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); 2261 *Pointer<Float4>(buffer) = oC.x; 2262 2263 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2264 2265 value = *Pointer<Float4>(buffer); 2266 2267 if((rgbaWriteMask & 0x00000003) != 0x00000003) 2268 { 2269 Float4 masked; 2270 2271 masked = value; 2272 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0]))); 2273 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0]))); 2274 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked)); 2275 } 2276 2277 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16)); 2278 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16)); 2279 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value)); 2280 *Pointer<Float4>(buffer) = oC.y; 2281 break; 2282 case FORMAT_X32B32G32R32F: 2283 case FORMAT_A32B32G32R32F: 2284 case FORMAT_A32B32G32R32I: 2285 case FORMAT_A32B32G32R32UI: 2286 buffer = cBuffer + 16 * x; 2287 2288 { 2289 value = *Pointer<Float4>(buffer, 16); 2290 2291 if(rgbaWriteMask != 0x0000000F) 2292 { 2293 Float4 masked = value; 2294 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2295 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2296 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked)); 2297 } 2298 2299 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16)); 2300 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16)); 2301 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); 2302 *Pointer<Float4>(buffer, 16) = oC.x; 2303 } 2304 2305 { 2306 value = *Pointer<Float4>(buffer + 16, 16); 2307 2308 if(rgbaWriteMask != 0x0000000F) 2309 { 2310 Float4 masked = value; 2311 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2312 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2313 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked)); 2314 } 2315 2316 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16)); 2317 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16)); 2318 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value)); 2319 *Pointer<Float4>(buffer + 16, 16) = oC.y; 2320 } 2321 2322 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])); 2323 2324 { 2325 value = *Pointer<Float4>(buffer, 16); 2326 2327 if(rgbaWriteMask != 0x0000000F) 2328 { 2329 Float4 masked = value; 2330 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2331 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2332 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked)); 2333 } 2334 2335 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16)); 2336 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16)); 2337 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value)); 2338 *Pointer<Float4>(buffer, 16) = oC.z; 2339 } 2340 2341 { 2342 value = (state.targetFormat[index] == FORMAT_X32B32G32R32F) ? Float4(1.0f) : *Pointer<Float4>(buffer + 16, 16); 2343 2344 if(rgbaWriteMask != 0x0000000F) 2345 { 2346 Float4 masked = value; 2347 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); 2348 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); 2349 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked)); 2350 } 2351 2352 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16)); 2353 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16)); 2354 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value)); 2355 *Pointer<Float4>(buffer + 16, 16) = oC.w; 2356 } 2357 break; 2358 default: 2359 ASSERT(false); 2360 } 2361 } 2362 convertFixed16(Float4 & cf,bool saturate)2363 UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate) 2364 { 2365 return UShort4(cf * Float4(0xFFFF), saturate); 2366 } 2367 sRGBtoLinear16_12_16(Vector4s & c)2368 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c) 2369 { 2370 c.x = As<UShort4>(c.x) >> 4; 2371 c.y = As<UShort4>(c.y) >> 4; 2372 c.z = As<UShort4>(c.z) >> 4; 2373 2374 sRGBtoLinear12_16(c); 2375 } 2376 sRGBtoLinear12_16(Vector4s & c)2377 void PixelRoutine::sRGBtoLinear12_16(Vector4s &c) 2378 { 2379 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16); 2380 2381 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); 2382 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); 2383 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2); 2384 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3); 2385 2386 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0); 2387 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1); 2388 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2); 2389 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3); 2390 2391 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0); 2392 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1); 2393 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2); 2394 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3); 2395 } 2396 linearToSRGB16_12_16(Vector4s & c)2397 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c) 2398 { 2399 c.x = As<UShort4>(c.x) >> 4; 2400 c.y = As<UShort4>(c.y) >> 4; 2401 c.z = As<UShort4>(c.z) >> 4; 2402 2403 linearToSRGB12_16(c); 2404 } 2405 linearToSRGB12_16(Vector4s & c)2406 void PixelRoutine::linearToSRGB12_16(Vector4s &c) 2407 { 2408 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16); 2409 2410 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0); 2411 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1); 2412 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2); 2413 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3); 2414 2415 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0); 2416 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1); 2417 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2); 2418 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3); 2419 2420 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0); 2421 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1); 2422 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2); 2423 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3); 2424 } 2425 sRGBtoLinear(const Float4 & x)2426 Float4 PixelRoutine::sRGBtoLinear(const Float4 &x) // Approximates x^2.2 2427 { 2428 Float4 linear = x * x; 2429 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f); 2430 2431 return Min(Max(linear, Float4(0.0f)), Float4(1.0f)); 2432 } 2433 colorUsed()2434 bool PixelRoutine::colorUsed() 2435 { 2436 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill; 2437 } 2438 } 2439