1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file blend_jit.cpp
24 *
25 * @brief Implementation of the blend jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "blend_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
// Format components whose bit width is <= QUANTIZE_THRESHOLD are quantized
// (see BlendJit::Quantize, which compares info.bpc[] against this value).
#define QUANTIZE_THRESHOLD 2
39
40 using namespace llvm;
41 using namespace SwrJit;
42
43 //////////////////////////////////////////////////////////////////////////
44 /// Interface to Jitting a blend shader
45 //////////////////////////////////////////////////////////////////////////
46 struct BlendJit : public Builder
47 {
BlendJitBlendJit48 BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
49
50 template <bool Color, bool Alpha>
GenerateBlendFactorBlendJit51 void GenerateBlendFactor(SWR_BLEND_FACTOR factor,
52 Value* constColor[4],
53 Value* src[4],
54 Value* src1[4],
55 Value* dst[4],
56 Value* result[4])
57 {
58 Value* out[4];
59
60 switch (factor)
61 {
62 case BLENDFACTOR_ONE:
63 out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
64 break;
65 case BLENDFACTOR_SRC_COLOR:
66 out[0] = src[0];
67 out[1] = src[1];
68 out[2] = src[2];
69 out[3] = src[3];
70 break;
71 case BLENDFACTOR_SRC_ALPHA:
72 out[0] = out[1] = out[2] = out[3] = src[3];
73 break;
74 case BLENDFACTOR_DST_ALPHA:
75 out[0] = out[1] = out[2] = out[3] = dst[3];
76 break;
77 case BLENDFACTOR_DST_COLOR:
78 out[0] = dst[0];
79 out[1] = dst[1];
80 out[2] = dst[2];
81 out[3] = dst[3];
82 break;
83 case BLENDFACTOR_SRC_ALPHA_SATURATE:
84 out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
85 out[3] = VIMMED1(1.0f);
86 break;
87 case BLENDFACTOR_CONST_COLOR:
88 out[0] = constColor[0];
89 out[1] = constColor[1];
90 out[2] = constColor[2];
91 out[3] = constColor[3];
92 break;
93 case BLENDFACTOR_CONST_ALPHA:
94 out[0] = out[1] = out[2] = out[3] = constColor[3];
95 break;
96 case BLENDFACTOR_SRC1_COLOR:
97 out[0] = src1[0];
98 out[1] = src1[1];
99 out[2] = src1[2];
100 out[3] = src1[3];
101 break;
102 case BLENDFACTOR_SRC1_ALPHA:
103 out[0] = out[1] = out[2] = out[3] = src1[3];
104 break;
105 case BLENDFACTOR_ZERO:
106 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
107 break;
108 case BLENDFACTOR_INV_SRC_COLOR:
109 out[0] = FSUB(VIMMED1(1.0f), src[0]);
110 out[1] = FSUB(VIMMED1(1.0f), src[1]);
111 out[2] = FSUB(VIMMED1(1.0f), src[2]);
112 out[3] = FSUB(VIMMED1(1.0f), src[3]);
113 break;
114 case BLENDFACTOR_INV_SRC_ALPHA:
115 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
116 break;
117 case BLENDFACTOR_INV_DST_ALPHA:
118 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
119 break;
120 case BLENDFACTOR_INV_DST_COLOR:
121 out[0] = FSUB(VIMMED1(1.0f), dst[0]);
122 out[1] = FSUB(VIMMED1(1.0f), dst[1]);
123 out[2] = FSUB(VIMMED1(1.0f), dst[2]);
124 out[3] = FSUB(VIMMED1(1.0f), dst[3]);
125 break;
126 case BLENDFACTOR_INV_CONST_COLOR:
127 out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
128 out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
129 out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
130 out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
131 break;
132 case BLENDFACTOR_INV_CONST_ALPHA:
133 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
134 break;
135 case BLENDFACTOR_INV_SRC1_COLOR:
136 out[0] = FSUB(VIMMED1(1.0f), src1[0]);
137 out[1] = FSUB(VIMMED1(1.0f), src1[1]);
138 out[2] = FSUB(VIMMED1(1.0f), src1[2]);
139 out[3] = FSUB(VIMMED1(1.0f), src1[3]);
140 break;
141 case BLENDFACTOR_INV_SRC1_ALPHA:
142 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
143 break;
144 default:
145 SWR_INVALID("Unsupported blend factor: %d", factor);
146 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
147 break;
148 }
149
150 if (Color)
151 {
152 result[0] = out[0];
153 result[1] = out[1];
154 result[2] = out[2];
155 }
156
157 if (Alpha)
158 {
159 result[3] = out[3];
160 }
161 }
162
ClampBlendJit163 void Clamp(SWR_FORMAT format, Value* src[4])
164 {
165 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
166 SWR_TYPE type = info.type[0];
167
168 switch (type)
169 {
170 default:
171 break;
172
173 case SWR_TYPE_UNORM:
174 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
175 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
176 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
177 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
178 break;
179
180 case SWR_TYPE_SNORM:
181 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
182 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
183 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
184 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
185 break;
186
187 case SWR_TYPE_UNKNOWN:
188 SWR_INVALID("Unsupport format type: %d", type);
189 }
190 }
191
ApplyDefaultsBlendJit192 void ApplyDefaults(SWR_FORMAT format, Value* src[4])
193 {
194 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
195
196 bool valid[] = {false, false, false, false};
197 for (uint32_t c = 0; c < info.numComps; ++c)
198 {
199 valid[info.swizzle[c]] = true;
200 }
201
202 for (uint32_t c = 0; c < 4; ++c)
203 {
204 if (!valid[c])
205 {
206 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
207 }
208 }
209 }
210
ApplyUnusedDefaultsBlendJit211 void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
212 {
213 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
214
215 for (uint32_t c = 0; c < info.numComps; ++c)
216 {
217 if (info.type[c] == SWR_TYPE_UNUSED)
218 {
219 src[info.swizzle[c]] =
220 BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
221 }
222 }
223 }
224
QuantizeBlendJit225 void Quantize(SWR_FORMAT format, Value* src[4])
226 {
227 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
228 for (uint32_t c = 0; c < info.numComps; ++c)
229 {
230 if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
231 {
232 uint32_t swizComp = info.swizzle[c];
233 float factor = (float)((1 << info.bpc[c]) - 1);
234 switch (info.type[c])
235 {
236 case SWR_TYPE_UNORM:
237 src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
238 src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
239 src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
240 break;
241 default:
242 SWR_INVALID("Unsupported format type: %d", info.type[c]);
243 }
244 }
245 }
246 }
247
248 template <bool Color, bool Alpha>
BlendFuncBlendJit249 void BlendFunc(SWR_BLEND_OP blendOp,
250 Value* src[4],
251 Value* srcFactor[4],
252 Value* dst[4],
253 Value* dstFactor[4],
254 Value* result[4])
255 {
256 Value* out[4];
257 Value* srcBlend[4];
258 Value* dstBlend[4];
259 for (uint32_t i = 0; i < 4; ++i)
260 {
261 srcBlend[i] = FMUL(src[i], srcFactor[i]);
262 dstBlend[i] = FMUL(dst[i], dstFactor[i]);
263 }
264
265 switch (blendOp)
266 {
267 case BLENDOP_ADD:
268 out[0] = FADD(srcBlend[0], dstBlend[0]);
269 out[1] = FADD(srcBlend[1], dstBlend[1]);
270 out[2] = FADD(srcBlend[2], dstBlend[2]);
271 out[3] = FADD(srcBlend[3], dstBlend[3]);
272 break;
273
274 case BLENDOP_SUBTRACT:
275 out[0] = FSUB(srcBlend[0], dstBlend[0]);
276 out[1] = FSUB(srcBlend[1], dstBlend[1]);
277 out[2] = FSUB(srcBlend[2], dstBlend[2]);
278 out[3] = FSUB(srcBlend[3], dstBlend[3]);
279 break;
280
281 case BLENDOP_REVSUBTRACT:
282 out[0] = FSUB(dstBlend[0], srcBlend[0]);
283 out[1] = FSUB(dstBlend[1], srcBlend[1]);
284 out[2] = FSUB(dstBlend[2], srcBlend[2]);
285 out[3] = FSUB(dstBlend[3], srcBlend[3]);
286 break;
287
288 case BLENDOP_MIN:
289 out[0] = VMINPS(src[0], dst[0]);
290 out[1] = VMINPS(src[1], dst[1]);
291 out[2] = VMINPS(src[2], dst[2]);
292 out[3] = VMINPS(src[3], dst[3]);
293 break;
294
295 case BLENDOP_MAX:
296 out[0] = VMAXPS(src[0], dst[0]);
297 out[1] = VMAXPS(src[1], dst[1]);
298 out[2] = VMAXPS(src[2], dst[2]);
299 out[3] = VMAXPS(src[3], dst[3]);
300 break;
301
302 default:
303 SWR_INVALID("Unsupported blend operation: %d", blendOp);
304 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
305 break;
306 }
307
308 if (Color)
309 {
310 result[0] = out[0];
311 result[1] = out[1];
312 result[2] = out[2];
313 }
314
315 if (Alpha)
316 {
317 result[3] = out[3];
318 }
319 }
320
LogicOpFuncBlendJit321 void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
322 {
323 // Op: (s == PS output, d = RT contents)
324 switch (logicOp)
325 {
326 case LOGICOP_CLEAR:
327 result[0] = VIMMED1(0);
328 result[1] = VIMMED1(0);
329 result[2] = VIMMED1(0);
330 result[3] = VIMMED1(0);
331 break;
332
333 case LOGICOP_NOR:
334 // ~(s | d)
335 result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
336 result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
337 result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
338 result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
339 break;
340
341 case LOGICOP_AND_INVERTED:
342 // ~s & d
343 // todo: use avx andnot instr when I can find the intrinsic to call
344 result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
345 result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
346 result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
347 result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
348 break;
349
350 case LOGICOP_COPY_INVERTED:
351 // ~s
352 result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
353 result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
354 result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
355 result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
356 break;
357
358 case LOGICOP_AND_REVERSE:
359 // s & ~d
360 // todo: use avx andnot instr when I can find the intrinsic to call
361 result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
362 result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
363 result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
364 result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
365 break;
366
367 case LOGICOP_INVERT:
368 // ~d
369 result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
370 result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
371 result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
372 result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
373 break;
374
375 case LOGICOP_XOR:
376 // s ^ d
377 result[0] = XOR(src[0], dst[0]);
378 result[1] = XOR(src[1], dst[1]);
379 result[2] = XOR(src[2], dst[2]);
380 result[3] = XOR(src[3], dst[3]);
381 break;
382
383 case LOGICOP_NAND:
384 // ~(s & d)
385 result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
386 result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
387 result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
388 result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
389 break;
390
391 case LOGICOP_AND:
392 // s & d
393 result[0] = AND(src[0], dst[0]);
394 result[1] = AND(src[1], dst[1]);
395 result[2] = AND(src[2], dst[2]);
396 result[3] = AND(src[3], dst[3]);
397 break;
398
399 case LOGICOP_EQUIV:
400 // ~(s ^ d)
401 result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
402 result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
403 result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
404 result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
405 break;
406
407 case LOGICOP_NOOP:
408 result[0] = dst[0];
409 result[1] = dst[1];
410 result[2] = dst[2];
411 result[3] = dst[3];
412 break;
413
414 case LOGICOP_OR_INVERTED:
415 // ~s | d
416 result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
417 result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
418 result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
419 result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
420 break;
421
422 case LOGICOP_COPY:
423 result[0] = src[0];
424 result[1] = src[1];
425 result[2] = src[2];
426 result[3] = src[3];
427 break;
428
429 case LOGICOP_OR_REVERSE:
430 // s | ~d
431 result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
432 result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
433 result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
434 result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
435 break;
436
437 case LOGICOP_OR:
438 // s | d
439 result[0] = OR(src[0], dst[0]);
440 result[1] = OR(src[1], dst[1]);
441 result[2] = OR(src[2], dst[2]);
442 result[3] = OR(src[3], dst[3]);
443 break;
444
445 case LOGICOP_SET:
446 result[0] = VIMMED1(0xFFFFFFFF);
447 result[1] = VIMMED1(0xFFFFFFFF);
448 result[2] = VIMMED1(0xFFFFFFFF);
449 result[3] = VIMMED1(0xFFFFFFFF);
450 break;
451
452 default:
453 SWR_INVALID("Unsupported logic operation: %d", logicOp);
454 result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
455 break;
456 }
457 }
458
459 void
AlphaTestBlendJit460 AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
461 {
462 // load uint32_t reference
463 Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));
464
465 // load alpha
466 Value* pAlpha = LOAD(ppAlpha, {0, 0});
467
468 Value* pTest = nullptr;
469 if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
470 {
471 // convert float alpha to unorm8
472 Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
473 pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
474
475 // compare
476 switch (state.alphaTestFunction)
477 {
478 case ZFUNC_ALWAYS:
479 pTest = VIMMED1(true);
480 break;
481 case ZFUNC_NEVER:
482 pTest = VIMMED1(false);
483 break;
484 case ZFUNC_LT:
485 pTest = ICMP_ULT(pAlphaU8, pRef);
486 break;
487 case ZFUNC_EQ:
488 pTest = ICMP_EQ(pAlphaU8, pRef);
489 break;
490 case ZFUNC_LE:
491 pTest = ICMP_ULE(pAlphaU8, pRef);
492 break;
493 case ZFUNC_GT:
494 pTest = ICMP_UGT(pAlphaU8, pRef);
495 break;
496 case ZFUNC_NE:
497 pTest = ICMP_NE(pAlphaU8, pRef);
498 break;
499 case ZFUNC_GE:
500 pTest = ICMP_UGE(pAlphaU8, pRef);
501 break;
502 default:
503 SWR_INVALID("Invalid alpha test function");
504 break;
505 }
506 }
507 else
508 {
509 // cast ref to float
510 pRef = BITCAST(pRef, mSimdFP32Ty);
511
512 // compare
513 switch (state.alphaTestFunction)
514 {
515 case ZFUNC_ALWAYS:
516 pTest = VIMMED1(true);
517 break;
518 case ZFUNC_NEVER:
519 pTest = VIMMED1(false);
520 break;
521 case ZFUNC_LT:
522 pTest = FCMP_OLT(pAlpha, pRef);
523 break;
524 case ZFUNC_EQ:
525 pTest = FCMP_OEQ(pAlpha, pRef);
526 break;
527 case ZFUNC_LE:
528 pTest = FCMP_OLE(pAlpha, pRef);
529 break;
530 case ZFUNC_GT:
531 pTest = FCMP_OGT(pAlpha, pRef);
532 break;
533 case ZFUNC_NE:
534 pTest = FCMP_ONE(pAlpha, pRef);
535 break;
536 case ZFUNC_GE:
537 pTest = FCMP_OGE(pAlpha, pRef);
538 break;
539 default:
540 SWR_INVALID("Invalid alpha test function");
541 break;
542 }
543 }
544
545 // load current mask
546 Value* pMask = LOAD(ppMask);
547
548 // convert to int1 mask
549 pMask = MASK(pMask);
550
551 // and with alpha test result
552 pMask = AND(pMask, pTest);
553
554 // convert back to vector mask
555 pMask = VMASK(pMask);
556
557 // store new mask
558 STORE(pMask, ppMask);
559 }
560
//////////////////////////////////////////////////////////////////////////
/// @brief Build the LLVM IR function for a blend state and run the function
///        pass pipeline over it.
/// @param state - blend compile state; a CRC of it forms the function name, and
///                its fields select which stages (alpha test, blend, logic op,
///                sample masking) are emitted
/// @return the generated function, after the passes have run
Function* Create(const BLEND_COMPILE_STATE& state)
{
    // Name is "BLND_" followed by the CRC of the state. The stream is opened with
    // ios_base::ate so the CRC is written after the prefix instead of over it.
    std::stringstream fnName("BLND_",
                             std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << ComputeCRC(0, &state, sizeof(state));

    // blend function signature
    // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);

    std::vector<Type*> args{
        PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*
    };

    FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
    Function* blendFunc = Function::Create(
        fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    // the module takes the function's name as its identifier
    blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());

    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);

    IRB()->SetInsertPoint(entry);

    // arguments: the single parameter is the blend context; every other input is
    // loaded out of it and given a readable IR name
    auto argitr = blendFunc->arg_begin();
    Value* pBlendContext = &*argitr++;
    pBlendContext->setName("pBlendContext");
    Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});
    pBlendState->setName("pBlendState");
    Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});
    pSrc->setName("src");
    Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});
    pSrc1->setName("src1");
    Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});
    pSrc0Alpha->setName("src0alpha");
    Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});
    sampleNum->setName("sampleNum");
    Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});
    pDst->setName("pDst");
    Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});
    pResult->setName("result");
    Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});
    ppoMask->setName("ppoMask");
    Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});
    ppMask->setName("pMask");

    // the code below treats the hot tile as four float channels
    static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
                  "Unsupported hot tile format");
    Value* dst[4];
    Value* constantColor[4];
    Value* src[4];
    Value* src1[4];
    Value* result[4];
    for (uint32_t i = 0; i < 4; ++i)
    {
        // load hot tile
        dst[i] = LOAD(pDst, {0, i});

        // load constant color
        constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));

        // load src
        src[i] = LOAD(pSrc, {0, i});

        // load src1
        src1[i] = LOAD(pSrc1, {0, i});
    }

    // starts as all ones; narrowed below by alpha-to-coverage, oMask and sampleMask
    Value* currentSampleMask = VIMMED1(-1);
    if (state.desc.alphaToCoverageEnable)
    {
        // scale the clamped source alpha by (2^numSamples - 1) and round to nearest
        // (add 0.5, convert to int) to get the integer used as the sample mask
        Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
        uint32_t bits = (1 << state.desc.numSamples) - 1;
        currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
        currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
    }

    // alpha test
    if (state.desc.alphaTestEnable)
    {
        // Gather for archrast stats
        STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
        AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
    }
    else
    {
        // Gather for archrast stats
        STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
    }

    // color blend
    if (state.blendState.blendEnable)
    {
        // Gather for archrast stats
        STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});

        // clamp sources
        Clamp(state.format, src);
        Clamp(state.format, src1);
        Clamp(state.format, dst);
        Clamp(state.format, constantColor);

        // apply defaults to hottile contents to take into account missing components
        ApplyDefaults(state.format, dst);

        // Force defaults for unused 'X' components
        ApplyUnusedDefaults(state.format, dst);

        // Quantize low precision components
        Quantize(state.format, dst);

        // special case clamping for R11G11B10_float which has no sign bit
        if (state.format == R11G11B10_FLOAT)
        {
            dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
            dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
            dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
            dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
        }

        Value* srcFactor[4];
        Value* dstFactor[4];
        if (state.desc.independentAlphaBlendEnable)
        {
            // color (channels 0..2) and alpha (channel 3) use separate factors and
            // separate blend functions; each call fills only its own channels
            GenerateBlendFactor<true, false>(
                state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
            GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,
                                             constantColor,
                                             src,
                                             src1,
                                             dst,
                                             srcFactor);

            GenerateBlendFactor<true, false>(
                state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
            GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,
                                             constantColor,
                                             src,
                                             src1,
                                             dst,
                                             dstFactor);

            BlendFunc<true, false>(
                state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
            BlendFunc<false, true>(
                state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
        }
        else
        {
            // one factor pair and the color blend function cover all four channels
            GenerateBlendFactor<true, true>(
                state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
            GenerateBlendFactor<true, true>(
                state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);

            BlendFunc<true, true>(
                state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
        }

        // store results out
        for (uint32_t i = 0; i < 4; ++i)
        {
            STORE(result[i], pResult, {0, i});
        }
    }
    else
    {
        // Gather for archrast stats
        STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
    }

    if (state.blendState.logicOpEnable)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
        // vMask keeps the low bpc bits of a channel; scale maps a normalized float
        // to its integer code. Both are only set for components that are not UNUSED.
        Value* vMask[4];
        float scale[4];

        // the blend path above already clamped src and dst when blending is enabled
        if (!state.blendState.blendEnable)
        {
            Clamp(state.format, src);
            Clamp(state.format, dst);
        }

        // Convert each used channel to the integer form the logic op works on.
        // NOTE(review): type[] and bpc[] are indexed directly by i here, without
        // going through info.swizzle as Quantize/ApplyUnusedDefaults do - confirm
        // that is intended.
        for (uint32_t i = 0; i < 4; i++)
        {
            if (info.type[i] == SWR_TYPE_UNUSED)
            {
                continue;
            }

            if (info.bpc[i] >= 32)
            {
                vMask[i] = VIMMED1(0xFFFFFFFF);
                scale[i] = 0xFFFFFFFF;
            }
            else
            {
                vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
                // SNORM spends one bit on the sign, so its positive range is one bit narrower
                if (info.type[i] == SWR_TYPE_SNORM)
                    scale[i] = (1 << (info.bpc[i] - 1)) - 1;
                else
                    scale[i] = (1 << info.bpc[i]) - 1;
            }

            switch (info.type[i])
            {
            default:
                SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
                break;

            case SWR_TYPE_UNKNOWN:
            case SWR_TYPE_UNUSED:
                // fallthrough
                // (SWR_TYPE_UNUSED cannot reach here because of the continue above)

            case SWR_TYPE_UINT:
            case SWR_TYPE_SINT:
                // integer formats: reinterpret the bits, no numeric conversion
                src[i] = BITCAST(src[i], mSimdInt32Ty);
                dst[i] = BITCAST(dst[i], mSimdInt32Ty);
                break;
            case SWR_TYPE_SNORM:
                src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
                dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
                break;
            case SWR_TYPE_UNORM:
                src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
                dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
                break;
            }
        }

        LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);

        // store results out (nothing is stored for UNUSED components)
        for (uint32_t i = 0; i < 4; ++i)
        {
            if (info.type[i] == SWR_TYPE_UNUSED)
            {
                continue;
            }

            // clear upper bits from PS output not in RT format after doing logic op
            result[i] = AND(result[i], vMask[i]);

            // convert the integer result back to the float hot tile representation
            switch (info.type[i])
            {
            default:
                SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
                break;

            case SWR_TYPE_UNKNOWN:
            case SWR_TYPE_UNUSED:
                // fallthrough

            case SWR_TYPE_UINT:
            case SWR_TYPE_SINT:
                result[i] = BITCAST(result[i], mSimdFP32Ty);
                break;
            case SWR_TYPE_SNORM:
                // shift left then arithmetic-shift right by (32 - bpc) to sign-extend
                // the bpc-bit value before converting to float
                result[i] = SHL(result[i], C(32 - info.bpc[i]));
                result[i] = ASHR(result[i], C(32 - info.bpc[i]));
                result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
                break;
            case SWR_TYPE_UNORM:
                result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
                break;
            }

            STORE(result[i], pResult, {0, i});
        }
    }

    if (state.desc.oMaskEnable)
    {
        // oMask and alpha-to-coverage are not expected to be enabled together
        assert(!(state.desc.alphaToCoverageEnable));
        // load current mask
        Value* oMask = LOAD(ppoMask);
        currentSampleMask = AND(oMask, currentSampleMask);
    }

    if (state.desc.sampleMaskEnable)
    {
        Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});
        currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
    }

    if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
        state.desc.oMaskEnable)
    {
        // load coverage mask and mask off any lanes with no samples
        Value* pMask = LOAD(ppMask);
        // keep only the bit of the sample being processed (1 << sampleNum) ...
        Value* sampleMasked = SHL(C(1), sampleNum);
        currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked));
        // ... and sign-extend the "bit is set" compare into a full per-lane mask
        currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
        Value* outputMask = AND(pMask, currentSampleMask);
        // store new mask
        STORE(outputMask, GEP(ppMask, C(0)));
    }

    RET_VOID();

    // dump the IR before the passes run
    JitManager::DumpToFile(blendFunc, "");

    ::FunctionPassManager passes(JM()->mpCurrentModule);

    passes.add(createBreakCriticalEdgesPass());
    passes.add(createCFGSimplificationPass());
    passes.add(createEarlyCSEPass());
    passes.add(createPromoteMemoryToRegisterPass());
    passes.add(createCFGSimplificationPass());
    passes.add(createEarlyCSEPass());
    passes.add(createInstructionCombiningPass());
    // this pass is only added when building against LLVM 11 or older
#if LLVM_VERSION_MAJOR <= 11
    passes.add(createConstantPropagationPass());
#endif
    passes.add(createSCCPPass());
    passes.add(createAggressiveDCEPass());

    passes.add(createLowerX86Pass(this));

    passes.run(*blendFunc);

    // dump again after the passes, tagged "optimized"
    JitManager::DumpToFile(blendFunc, "optimized");

    return blendFunc;
}
887 };
888
//////////////////////////////////////////////////////////////////////////
/// @brief JITs from blend shader IR
/// @param hJitMgr - JitManager handle
/// @param hFunc - LLVM function IR
/// @return PFN_BLEND_JIT_FUNC - pointer to blend code
JitBlendFunc(HANDLE hJitMgr,const HANDLE hFunc)894 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
895 {
896 const llvm::Function* func = (const llvm::Function*)hFunc;
897 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
898 PFN_BLEND_JIT_FUNC pfnBlend;
899 pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
900 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
901 // add new IR to the module
902 pJitMgr->mIsModuleFinalized = true;
903
904 return pfnBlend;
905 }
906
907 //////////////////////////////////////////////////////////////////////////
908 /// @brief JIT compiles blend shader
909 /// @param hJitMgr - JitManager handle
910 /// @param state - blend state to build function from
JitCompileBlend(HANDLE hJitMgr,const BLEND_COMPILE_STATE & state)911 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr,
912 const BLEND_COMPILE_STATE& state)
913 {
914 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
915
916 pJitMgr->SetupNewModule();
917
918 BlendJit theJit(pJitMgr);
919 HANDLE hFunc = theJit.Create(state);
920
921 return JitBlendFunc(hJitMgr, hFunc);
922 }
923