1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file blend_jit.cpp
24  *
25  * @brief Implementation of the blend jitter
26  *
27  * Notes:
28  *
29  ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "blend_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36 
37 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
38 #define QUANTIZE_THRESHOLD 2
39 
40 using namespace llvm;
41 using namespace SwrJit;
42 
43 //////////////////////////////////////////////////////////////////////////
44 /// Interface to Jitting a blend shader
45 //////////////////////////////////////////////////////////////////////////
46 struct BlendJit : public Builder
47 {
BlendJitBlendJit48     BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
49 
50     template <bool Color, bool Alpha>
GenerateBlendFactorBlendJit51     void GenerateBlendFactor(SWR_BLEND_FACTOR factor,
52                              Value*           constColor[4],
53                              Value*           src[4],
54                              Value*           src1[4],
55                              Value*           dst[4],
56                              Value*           result[4])
57     {
58         Value* out[4];
59 
60         switch (factor)
61         {
62         case BLENDFACTOR_ONE:
63             out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
64             break;
65         case BLENDFACTOR_SRC_COLOR:
66             out[0] = src[0];
67             out[1] = src[1];
68             out[2] = src[2];
69             out[3] = src[3];
70             break;
71         case BLENDFACTOR_SRC_ALPHA:
72             out[0] = out[1] = out[2] = out[3] = src[3];
73             break;
74         case BLENDFACTOR_DST_ALPHA:
75             out[0] = out[1] = out[2] = out[3] = dst[3];
76             break;
77         case BLENDFACTOR_DST_COLOR:
78             out[0] = dst[0];
79             out[1] = dst[1];
80             out[2] = dst[2];
81             out[3] = dst[3];
82             break;
83         case BLENDFACTOR_SRC_ALPHA_SATURATE:
84             out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
85             out[3]                   = VIMMED1(1.0f);
86             break;
87         case BLENDFACTOR_CONST_COLOR:
88             out[0] = constColor[0];
89             out[1] = constColor[1];
90             out[2] = constColor[2];
91             out[3] = constColor[3];
92             break;
93         case BLENDFACTOR_CONST_ALPHA:
94             out[0] = out[1] = out[2] = out[3] = constColor[3];
95             break;
96         case BLENDFACTOR_SRC1_COLOR:
97             out[0] = src1[0];
98             out[1] = src1[1];
99             out[2] = src1[2];
100             out[3] = src1[3];
101             break;
102         case BLENDFACTOR_SRC1_ALPHA:
103             out[0] = out[1] = out[2] = out[3] = src1[3];
104             break;
105         case BLENDFACTOR_ZERO:
106             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
107             break;
108         case BLENDFACTOR_INV_SRC_COLOR:
109             out[0] = FSUB(VIMMED1(1.0f), src[0]);
110             out[1] = FSUB(VIMMED1(1.0f), src[1]);
111             out[2] = FSUB(VIMMED1(1.0f), src[2]);
112             out[3] = FSUB(VIMMED1(1.0f), src[3]);
113             break;
114         case BLENDFACTOR_INV_SRC_ALPHA:
115             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
116             break;
117         case BLENDFACTOR_INV_DST_ALPHA:
118             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
119             break;
120         case BLENDFACTOR_INV_DST_COLOR:
121             out[0] = FSUB(VIMMED1(1.0f), dst[0]);
122             out[1] = FSUB(VIMMED1(1.0f), dst[1]);
123             out[2] = FSUB(VIMMED1(1.0f), dst[2]);
124             out[3] = FSUB(VIMMED1(1.0f), dst[3]);
125             break;
126         case BLENDFACTOR_INV_CONST_COLOR:
127             out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
128             out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
129             out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
130             out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
131             break;
132         case BLENDFACTOR_INV_CONST_ALPHA:
133             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
134             break;
135         case BLENDFACTOR_INV_SRC1_COLOR:
136             out[0] = FSUB(VIMMED1(1.0f), src1[0]);
137             out[1] = FSUB(VIMMED1(1.0f), src1[1]);
138             out[2] = FSUB(VIMMED1(1.0f), src1[2]);
139             out[3] = FSUB(VIMMED1(1.0f), src1[3]);
140             break;
141         case BLENDFACTOR_INV_SRC1_ALPHA:
142             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
143             break;
144         default:
145             SWR_INVALID("Unsupported blend factor: %d", factor);
146             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
147             break;
148         }
149 
150         if (Color)
151         {
152             result[0] = out[0];
153             result[1] = out[1];
154             result[2] = out[2];
155         }
156 
157         if (Alpha)
158         {
159             result[3] = out[3];
160         }
161     }
162 
ClampBlendJit163     void Clamp(SWR_FORMAT format, Value* src[4])
164     {
165         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
166         SWR_TYPE               type = info.type[0];
167 
168         switch (type)
169         {
170         default:
171             break;
172 
173         case SWR_TYPE_UNORM:
174             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
175             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
176             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
177             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
178             break;
179 
180         case SWR_TYPE_SNORM:
181             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
182             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
183             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
184             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
185             break;
186 
187         case SWR_TYPE_UNKNOWN:
188             SWR_INVALID("Unsupport format type: %d", type);
189         }
190     }
191 
ApplyDefaultsBlendJit192     void ApplyDefaults(SWR_FORMAT format, Value* src[4])
193     {
194         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
195 
196         bool valid[] = {false, false, false, false};
197         for (uint32_t c = 0; c < info.numComps; ++c)
198         {
199             valid[info.swizzle[c]] = true;
200         }
201 
202         for (uint32_t c = 0; c < 4; ++c)
203         {
204             if (!valid[c])
205             {
206                 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
207             }
208         }
209     }
210 
ApplyUnusedDefaultsBlendJit211     void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
212     {
213         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
214 
215         for (uint32_t c = 0; c < info.numComps; ++c)
216         {
217             if (info.type[c] == SWR_TYPE_UNUSED)
218             {
219                 src[info.swizzle[c]] =
220                     BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
221             }
222         }
223     }
224 
QuantizeBlendJit225     void Quantize(SWR_FORMAT format, Value* src[4])
226     {
227         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
228         for (uint32_t c = 0; c < info.numComps; ++c)
229         {
230             if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
231             {
232                 uint32_t swizComp = info.swizzle[c];
233                 float    factor   = (float)((1 << info.bpc[c]) - 1);
234                 switch (info.type[c])
235                 {
236                 case SWR_TYPE_UNORM:
237                     src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
238                     src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
239                     src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
240                     break;
241                 default:
242                     SWR_INVALID("Unsupported format type: %d", info.type[c]);
243                 }
244             }
245         }
246     }
247 
248     template <bool Color, bool Alpha>
BlendFuncBlendJit249     void BlendFunc(SWR_BLEND_OP blendOp,
250                    Value*       src[4],
251                    Value*       srcFactor[4],
252                    Value*       dst[4],
253                    Value*       dstFactor[4],
254                    Value*       result[4])
255     {
256         Value* out[4];
257         Value* srcBlend[4];
258         Value* dstBlend[4];
259         for (uint32_t i = 0; i < 4; ++i)
260         {
261             srcBlend[i] = FMUL(src[i], srcFactor[i]);
262             dstBlend[i] = FMUL(dst[i], dstFactor[i]);
263         }
264 
265         switch (blendOp)
266         {
267         case BLENDOP_ADD:
268             out[0] = FADD(srcBlend[0], dstBlend[0]);
269             out[1] = FADD(srcBlend[1], dstBlend[1]);
270             out[2] = FADD(srcBlend[2], dstBlend[2]);
271             out[3] = FADD(srcBlend[3], dstBlend[3]);
272             break;
273 
274         case BLENDOP_SUBTRACT:
275             out[0] = FSUB(srcBlend[0], dstBlend[0]);
276             out[1] = FSUB(srcBlend[1], dstBlend[1]);
277             out[2] = FSUB(srcBlend[2], dstBlend[2]);
278             out[3] = FSUB(srcBlend[3], dstBlend[3]);
279             break;
280 
281         case BLENDOP_REVSUBTRACT:
282             out[0] = FSUB(dstBlend[0], srcBlend[0]);
283             out[1] = FSUB(dstBlend[1], srcBlend[1]);
284             out[2] = FSUB(dstBlend[2], srcBlend[2]);
285             out[3] = FSUB(dstBlend[3], srcBlend[3]);
286             break;
287 
288         case BLENDOP_MIN:
289             out[0] = VMINPS(src[0], dst[0]);
290             out[1] = VMINPS(src[1], dst[1]);
291             out[2] = VMINPS(src[2], dst[2]);
292             out[3] = VMINPS(src[3], dst[3]);
293             break;
294 
295         case BLENDOP_MAX:
296             out[0] = VMAXPS(src[0], dst[0]);
297             out[1] = VMAXPS(src[1], dst[1]);
298             out[2] = VMAXPS(src[2], dst[2]);
299             out[3] = VMAXPS(src[3], dst[3]);
300             break;
301 
302         default:
303             SWR_INVALID("Unsupported blend operation: %d", blendOp);
304             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
305             break;
306         }
307 
308         if (Color)
309         {
310             result[0] = out[0];
311             result[1] = out[1];
312             result[2] = out[2];
313         }
314 
315         if (Alpha)
316         {
317             result[3] = out[3];
318         }
319     }
320 
LogicOpFuncBlendJit321     void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
322     {
323         // Op: (s == PS output, d = RT contents)
324         switch (logicOp)
325         {
326         case LOGICOP_CLEAR:
327             result[0] = VIMMED1(0);
328             result[1] = VIMMED1(0);
329             result[2] = VIMMED1(0);
330             result[3] = VIMMED1(0);
331             break;
332 
333         case LOGICOP_NOR:
334             // ~(s | d)
335             result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
336             result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
337             result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
338             result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
339             break;
340 
341         case LOGICOP_AND_INVERTED:
342             // ~s & d
343             // todo: use avx andnot instr when I can find the intrinsic to call
344             result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
345             result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
346             result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
347             result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
348             break;
349 
350         case LOGICOP_COPY_INVERTED:
351             // ~s
352             result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
353             result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
354             result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
355             result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
356             break;
357 
358         case LOGICOP_AND_REVERSE:
359             // s & ~d
360             // todo: use avx andnot instr when I can find the intrinsic to call
361             result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
362             result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
363             result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
364             result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
365             break;
366 
367         case LOGICOP_INVERT:
368             // ~d
369             result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
370             result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
371             result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
372             result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
373             break;
374 
375         case LOGICOP_XOR:
376             // s ^ d
377             result[0] = XOR(src[0], dst[0]);
378             result[1] = XOR(src[1], dst[1]);
379             result[2] = XOR(src[2], dst[2]);
380             result[3] = XOR(src[3], dst[3]);
381             break;
382 
383         case LOGICOP_NAND:
384             // ~(s & d)
385             result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
386             result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
387             result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
388             result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
389             break;
390 
391         case LOGICOP_AND:
392             // s & d
393             result[0] = AND(src[0], dst[0]);
394             result[1] = AND(src[1], dst[1]);
395             result[2] = AND(src[2], dst[2]);
396             result[3] = AND(src[3], dst[3]);
397             break;
398 
399         case LOGICOP_EQUIV:
400             // ~(s ^ d)
401             result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
402             result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
403             result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
404             result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
405             break;
406 
407         case LOGICOP_NOOP:
408             result[0] = dst[0];
409             result[1] = dst[1];
410             result[2] = dst[2];
411             result[3] = dst[3];
412             break;
413 
414         case LOGICOP_OR_INVERTED:
415             // ~s | d
416             result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
417             result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
418             result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
419             result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
420             break;
421 
422         case LOGICOP_COPY:
423             result[0] = src[0];
424             result[1] = src[1];
425             result[2] = src[2];
426             result[3] = src[3];
427             break;
428 
429         case LOGICOP_OR_REVERSE:
430             // s | ~d
431             result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
432             result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
433             result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
434             result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
435             break;
436 
437         case LOGICOP_OR:
438             // s | d
439             result[0] = OR(src[0], dst[0]);
440             result[1] = OR(src[1], dst[1]);
441             result[2] = OR(src[2], dst[2]);
442             result[3] = OR(src[3], dst[3]);
443             break;
444 
445         case LOGICOP_SET:
446             result[0] = VIMMED1(0xFFFFFFFF);
447             result[1] = VIMMED1(0xFFFFFFFF);
448             result[2] = VIMMED1(0xFFFFFFFF);
449             result[3] = VIMMED1(0xFFFFFFFF);
450             break;
451 
452         default:
453             SWR_INVALID("Unsupported logic operation: %d", logicOp);
454             result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
455             break;
456         }
457     }
458 
459     void
AlphaTestBlendJit460     AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
461     {
462         // load uint32_t reference
463         Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));
464 
465         // load alpha
466         Value* pAlpha = LOAD(ppAlpha, {0, 0});
467 
468         Value* pTest = nullptr;
469         if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
470         {
471             // convert float alpha to unorm8
472             Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
473             pAlphaU8        = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
474 
475             // compare
476             switch (state.alphaTestFunction)
477             {
478             case ZFUNC_ALWAYS:
479                 pTest = VIMMED1(true);
480                 break;
481             case ZFUNC_NEVER:
482                 pTest = VIMMED1(false);
483                 break;
484             case ZFUNC_LT:
485                 pTest = ICMP_ULT(pAlphaU8, pRef);
486                 break;
487             case ZFUNC_EQ:
488                 pTest = ICMP_EQ(pAlphaU8, pRef);
489                 break;
490             case ZFUNC_LE:
491                 pTest = ICMP_ULE(pAlphaU8, pRef);
492                 break;
493             case ZFUNC_GT:
494                 pTest = ICMP_UGT(pAlphaU8, pRef);
495                 break;
496             case ZFUNC_NE:
497                 pTest = ICMP_NE(pAlphaU8, pRef);
498                 break;
499             case ZFUNC_GE:
500                 pTest = ICMP_UGE(pAlphaU8, pRef);
501                 break;
502             default:
503                 SWR_INVALID("Invalid alpha test function");
504                 break;
505             }
506         }
507         else
508         {
509             // cast ref to float
510             pRef = BITCAST(pRef, mSimdFP32Ty);
511 
512             // compare
513             switch (state.alphaTestFunction)
514             {
515             case ZFUNC_ALWAYS:
516                 pTest = VIMMED1(true);
517                 break;
518             case ZFUNC_NEVER:
519                 pTest = VIMMED1(false);
520                 break;
521             case ZFUNC_LT:
522                 pTest = FCMP_OLT(pAlpha, pRef);
523                 break;
524             case ZFUNC_EQ:
525                 pTest = FCMP_OEQ(pAlpha, pRef);
526                 break;
527             case ZFUNC_LE:
528                 pTest = FCMP_OLE(pAlpha, pRef);
529                 break;
530             case ZFUNC_GT:
531                 pTest = FCMP_OGT(pAlpha, pRef);
532                 break;
533             case ZFUNC_NE:
534                 pTest = FCMP_ONE(pAlpha, pRef);
535                 break;
536             case ZFUNC_GE:
537                 pTest = FCMP_OGE(pAlpha, pRef);
538                 break;
539             default:
540                 SWR_INVALID("Invalid alpha test function");
541                 break;
542             }
543         }
544 
545         // load current mask
546         Value* pMask = LOAD(ppMask);
547 
548         // convert to int1 mask
549         pMask = MASK(pMask);
550 
551         // and with alpha test result
552         pMask = AND(pMask, pTest);
553 
554         // convert back to vector mask
555         pMask = VMASK(pMask);
556 
557         // store new mask
558         STORE(pMask, ppMask);
559     }
560 
CreateBlendJit561     Function* Create(const BLEND_COMPILE_STATE& state)
562     {
563         std::stringstream fnName("BLND_",
564                                  std::ios_base::in | std::ios_base::out | std::ios_base::ate);
565         fnName << ComputeCRC(0, &state, sizeof(state));
566 
567         // blend function signature
568         // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);
569 
570         std::vector<Type*> args{
571             PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*
572         };
573 
574         // std::vector<Type*> args{
575         //    PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT*
576         //};
577 
578         FunctionType* fTy       = FunctionType::get(IRB()->getVoidTy(), args, false);
579         Function*     blendFunc = Function::Create(
580             fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
581         blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
582 
583         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
584 
585         IRB()->SetInsertPoint(entry);
586 
587         // arguments
588         auto   argitr        = blendFunc->arg_begin();
589         Value* pBlendContext = &*argitr++;
590         pBlendContext->setName("pBlendContext");
591         Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});
592         pBlendState->setName("pBlendState");
593         Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});
594         pSrc->setName("src");
595         Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});
596         pSrc1->setName("src1");
597         Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});
598         pSrc0Alpha->setName("src0alpha");
599         Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});
600         sampleNum->setName("sampleNum");
601         Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});
602         pDst->setName("pDst");
603         Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});
604         pResult->setName("result");
605         Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});
606         ppoMask->setName("ppoMask");
607         Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});
608         ppMask->setName("pMask");
609 
610         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
611                       "Unsupported hot tile format");
612         Value* dst[4];
613         Value* constantColor[4];
614         Value* src[4];
615         Value* src1[4];
616         Value* result[4];
617         for (uint32_t i = 0; i < 4; ++i)
618         {
619             // load hot tile
620             dst[i] = LOAD(pDst, {0, i});
621 
622             // load constant color
623             constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));
624 
625             // load src
626             src[i] = LOAD(pSrc, {0, i});
627 
628             // load src1
629             src1[i] = LOAD(pSrc1, {0, i});
630         }
631         Value* currentSampleMask = VIMMED1(-1);
632         if (state.desc.alphaToCoverageEnable)
633         {
634             Value*   pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
635             uint32_t bits        = (1 << state.desc.numSamples) - 1;
636             currentSampleMask    = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
637             currentSampleMask    = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
638         }
639 
640         // alpha test
641         if (state.desc.alphaTestEnable)
642         {
643             // Gather for archrast stats
644             STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
645             AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
646         }
647         else
648         {
649             // Gather for archrast stats
650             STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
651         }
652 
653         // color blend
654         if (state.blendState.blendEnable)
655         {
656             // Gather for archrast stats
657             STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
658 
659             // clamp sources
660             Clamp(state.format, src);
661             Clamp(state.format, src1);
662             Clamp(state.format, dst);
663             Clamp(state.format, constantColor);
664 
665             // apply defaults to hottile contents to take into account missing components
666             ApplyDefaults(state.format, dst);
667 
668             // Force defaults for unused 'X' components
669             ApplyUnusedDefaults(state.format, dst);
670 
671             // Quantize low precision components
672             Quantize(state.format, dst);
673 
674             // special case clamping for R11G11B10_float which has no sign bit
675             if (state.format == R11G11B10_FLOAT)
676             {
677                 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
678                 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
679                 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
680                 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
681             }
682 
683             Value* srcFactor[4];
684             Value* dstFactor[4];
685             if (state.desc.independentAlphaBlendEnable)
686             {
687                 GenerateBlendFactor<true, false>(
688                     state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
689                 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,
690                                                  constantColor,
691                                                  src,
692                                                  src1,
693                                                  dst,
694                                                  srcFactor);
695 
696                 GenerateBlendFactor<true, false>(
697                     state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
698                 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,
699                                                  constantColor,
700                                                  src,
701                                                  src1,
702                                                  dst,
703                                                  dstFactor);
704 
705                 BlendFunc<true, false>(
706                     state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
707                 BlendFunc<false, true>(
708                     state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
709             }
710             else
711             {
712                 GenerateBlendFactor<true, true>(
713                     state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
714                 GenerateBlendFactor<true, true>(
715                     state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
716 
717                 BlendFunc<true, true>(
718                     state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
719             }
720 
721             // store results out
722             for (uint32_t i = 0; i < 4; ++i)
723             {
724                 STORE(result[i], pResult, {0, i});
725             }
726         }
727         else
728         {
729             // Gather for archrast stats
730             STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
731         }
732 
733         if (state.blendState.logicOpEnable)
734         {
735             const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
736             Value*                 vMask[4];
737             float                  scale[4];
738 
739             if (!state.blendState.blendEnable)
740             {
741                 Clamp(state.format, src);
742                 Clamp(state.format, dst);
743             }
744 
745             for (uint32_t i = 0; i < 4; i++)
746             {
747                 if (info.type[i] == SWR_TYPE_UNUSED)
748                 {
749                     continue;
750                 }
751 
752                 if (info.bpc[i] >= 32)
753                 {
754                     vMask[i] = VIMMED1(0xFFFFFFFF);
755                     scale[i] = 0xFFFFFFFF;
756                 }
757                 else
758                 {
759                     vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
760                     if (info.type[i] == SWR_TYPE_SNORM)
761                         scale[i] = (1 << (info.bpc[i] - 1)) - 1;
762                     else
763                         scale[i] = (1 << info.bpc[i]) - 1;
764                 }
765 
766                 switch (info.type[i])
767                 {
768                 default:
769                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
770                     break;
771 
772                 case SWR_TYPE_UNKNOWN:
773                 case SWR_TYPE_UNUSED:
774                     // fallthrough
775 
776                 case SWR_TYPE_UINT:
777                 case SWR_TYPE_SINT:
778                     src[i] = BITCAST(src[i], mSimdInt32Ty);
779                     dst[i] = BITCAST(dst[i], mSimdInt32Ty);
780                     break;
781                 case SWR_TYPE_SNORM:
782                     src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
783                     dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
784                     break;
785                 case SWR_TYPE_UNORM:
786                     src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
787                     dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
788                     break;
789                 }
790             }
791 
792             LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
793 
794             // store results out
795             for (uint32_t i = 0; i < 4; ++i)
796             {
797                 if (info.type[i] == SWR_TYPE_UNUSED)
798                 {
799                     continue;
800                 }
801 
802                 // clear upper bits from PS output not in RT format after doing logic op
803                 result[i] = AND(result[i], vMask[i]);
804 
805                 switch (info.type[i])
806                 {
807                 default:
808                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
809                     break;
810 
811                 case SWR_TYPE_UNKNOWN:
812                 case SWR_TYPE_UNUSED:
813                     // fallthrough
814 
815                 case SWR_TYPE_UINT:
816                 case SWR_TYPE_SINT:
817                     result[i] = BITCAST(result[i], mSimdFP32Ty);
818                     break;
819                 case SWR_TYPE_SNORM:
820                     result[i] = SHL(result[i], C(32 - info.bpc[i]));
821                     result[i] = ASHR(result[i], C(32 - info.bpc[i]));
822                     result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
823                     break;
824                 case SWR_TYPE_UNORM:
825                     result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
826                     break;
827                 }
828 
829                 STORE(result[i], pResult, {0, i});
830             }
831         }
832 
833         if (state.desc.oMaskEnable)
834         {
835             assert(!(state.desc.alphaToCoverageEnable));
836             // load current mask
837             Value* oMask      = LOAD(ppoMask);
838             currentSampleMask = AND(oMask, currentSampleMask);
839         }
840 
841         if (state.desc.sampleMaskEnable)
842         {
843             Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});
844             currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
845         }
846 
847         if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
848             state.desc.oMaskEnable)
849         {
850             // load coverage mask and mask off any lanes with no samples
851             Value* pMask        = LOAD(ppMask);
852             Value* sampleMasked = SHL(C(1), sampleNum);
853             currentSampleMask   = AND(currentSampleMask, VBROADCAST(sampleMasked));
854             currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
855             Value* outputMask = AND(pMask, currentSampleMask);
856             // store new mask
857             STORE(outputMask, GEP(ppMask, C(0)));
858         }
859 
860         RET_VOID();
861 
862         JitManager::DumpToFile(blendFunc, "");
863 
864         ::FunctionPassManager passes(JM()->mpCurrentModule);
865 
866         passes.add(createBreakCriticalEdgesPass());
867         passes.add(createCFGSimplificationPass());
868         passes.add(createEarlyCSEPass());
869         passes.add(createPromoteMemoryToRegisterPass());
870         passes.add(createCFGSimplificationPass());
871         passes.add(createEarlyCSEPass());
872         passes.add(createInstructionCombiningPass());
873 #if LLVM_VERSION_MAJOR <= 11
874         passes.add(createConstantPropagationPass());
875 #endif
876         passes.add(createSCCPPass());
877         passes.add(createAggressiveDCEPass());
878 
879         passes.add(createLowerX86Pass(this));
880 
881         passes.run(*blendFunc);
882 
883         JitManager::DumpToFile(blendFunc, "optimized");
884 
885         return blendFunc;
886     }
887 };
888 
889 //////////////////////////////////////////////////////////////////////////
890 /// @brief JITs from fetch shader IR
891 /// @param hJitMgr - JitManager handle
892 /// @param func   - LLVM function IR
893 /// @return PFN_FETCH_FUNC - pointer to fetch code
JitBlendFunc(HANDLE hJitMgr,const HANDLE hFunc)894 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
895 {
896     const llvm::Function* func    = (const llvm::Function*)hFunc;
897     JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
898     PFN_BLEND_JIT_FUNC    pfnBlend;
899     pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
900     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
901     // add new IR to the module
902     pJitMgr->mIsModuleFinalized = true;
903 
904     return pfnBlend;
905 }
906 
907 //////////////////////////////////////////////////////////////////////////
908 /// @brief JIT compiles blend shader
909 /// @param hJitMgr - JitManager handle
910 /// @param state   - blend state to build function from
JitCompileBlend(HANDLE hJitMgr,const BLEND_COMPILE_STATE & state)911 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE                     hJitMgr,
912                                                       const BLEND_COMPILE_STATE& state)
913 {
914     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
915 
916     pJitMgr->SetupNewModule();
917 
918     BlendJit theJit(pJitMgr);
919     HANDLE   hFunc = theJit.Create(state);
920 
921     return JitBlendFunc(hJitMgr, hFunc);
922 }
923