1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file blend_jit.cpp
24 *
25 * @brief Implementation of the blend jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "blend_jit.h"
34 #include "gen_state_llvm.h"
35 
36 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
37 #define QUANTIZE_THRESHOLD 2
38 
39 using namespace llvm;
40 using namespace SwrJit;
41 
42 //////////////////////////////////////////////////////////////////////////
43 /// Interface to Jitting a blend shader
44 //////////////////////////////////////////////////////////////////////////
45 struct BlendJit : public Builder
46 {
BlendJitBlendJit47     BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
48 
49     template<bool Color, bool Alpha>
GenerateBlendFactorBlendJit50     void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
51     {
52         Value* out[4];
53 
54         switch (factor)
55         {
56         case BLENDFACTOR_ONE:
57             out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
58             break;
59         case BLENDFACTOR_SRC_COLOR:
60             out[0] = src[0];
61             out[1] = src[1];
62             out[2] = src[2];
63             out[3] = src[3];
64             break;
65         case BLENDFACTOR_SRC_ALPHA:
66             out[0] = out[1] = out[2] = out[3] = src[3];
67             break;
68         case BLENDFACTOR_DST_ALPHA:
69             out[0] = out[1] = out[2] = out[3] = dst[3];
70             break;
71         case BLENDFACTOR_DST_COLOR:
72             out[0] = dst[0];
73             out[1] = dst[1];
74             out[2] = dst[2];
75             out[3] = dst[3];
76             break;
77         case BLENDFACTOR_SRC_ALPHA_SATURATE:
78             out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
79             out[3] = VIMMED1(1.0f);
80             break;
81         case BLENDFACTOR_CONST_COLOR:
82             out[0] = constColor[0];
83             out[1] = constColor[1];
84             out[2] = constColor[2];
85             out[3] = constColor[3];
86             break;
87         case BLENDFACTOR_CONST_ALPHA:
88             out[0] = out[1] = out[2] = out[3] = constColor[3];
89             break;
90         case BLENDFACTOR_SRC1_COLOR:
91             out[0] = src1[0];
92             out[1] = src1[1];
93             out[2] = src1[2];
94             out[3] = src1[3];
95             break;
96         case BLENDFACTOR_SRC1_ALPHA:
97             out[0] = out[1] = out[2] = out[3] = src1[3];
98             break;
99         case BLENDFACTOR_ZERO:
100             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
101             break;
102         case BLENDFACTOR_INV_SRC_COLOR:
103             out[0] = FSUB(VIMMED1(1.0f), src[0]);
104             out[1] = FSUB(VIMMED1(1.0f), src[1]);
105             out[2] = FSUB(VIMMED1(1.0f), src[2]);
106             out[3] = FSUB(VIMMED1(1.0f), src[3]);
107             break;
108         case BLENDFACTOR_INV_SRC_ALPHA:
109             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
110             break;
111         case BLENDFACTOR_INV_DST_ALPHA:
112             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
113             break;
114         case BLENDFACTOR_INV_DST_COLOR:
115             out[0] = FSUB(VIMMED1(1.0f), dst[0]);
116             out[1] = FSUB(VIMMED1(1.0f), dst[1]);
117             out[2] = FSUB(VIMMED1(1.0f), dst[2]);
118             out[3] = FSUB(VIMMED1(1.0f), dst[3]);
119             break;
120         case BLENDFACTOR_INV_CONST_COLOR:
121             out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
122             out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
123             out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
124             out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
125             break;
126         case BLENDFACTOR_INV_CONST_ALPHA:
127             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
128             break;
129         case BLENDFACTOR_INV_SRC1_COLOR:
130             out[0] = FSUB(VIMMED1(1.0f), src1[0]);
131             out[1] = FSUB(VIMMED1(1.0f), src1[1]);
132             out[2] = FSUB(VIMMED1(1.0f), src1[2]);
133             out[3] = FSUB(VIMMED1(1.0f), src1[3]);
134             break;
135         case BLENDFACTOR_INV_SRC1_ALPHA:
136             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
137             break;
138         default:
139             SWR_INVALID("Unsupported blend factor: %d", factor);
140             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
141             break;
142         }
143 
144         if (Color)
145         {
146             result[0] = out[0];
147             result[1] = out[1];
148             result[2] = out[2];
149         }
150 
151         if (Alpha)
152         {
153             result[3] = out[3];
154         }
155     }
156 
ClampBlendJit157     void Clamp(SWR_FORMAT format, Value* src[4])
158     {
159         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
160         SWR_TYPE type = info.type[0];
161 
162         switch (type)
163         {
164         default:
165             break;
166 
167         case SWR_TYPE_UNORM:
168             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
169             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
170             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
171             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
172             break;
173 
174         case SWR_TYPE_SNORM:
175             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
176             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
177             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
178             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
179             break;
180 
181         case SWR_TYPE_UNKNOWN: SWR_INVALID("Unsupport format type: %d", type);
182         }
183     }
184 
ApplyDefaultsBlendJit185     void ApplyDefaults(SWR_FORMAT format, Value* src[4])
186     {
187         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
188 
189         bool valid[] = { false, false, false, false };
190         for (uint32_t c = 0; c < info.numComps; ++c)
191         {
192             valid[info.swizzle[c]] = true;
193         }
194 
195         for (uint32_t c = 0; c < 4; ++c)
196         {
197             if (!valid[c])
198             {
199                 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
200             }
201         }
202     }
203 
ApplyUnusedDefaultsBlendJit204     void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
205     {
206         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
207 
208         for (uint32_t c = 0; c < info.numComps; ++c)
209         {
210             if (info.type[c] == SWR_TYPE_UNUSED)
211             {
212                 src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
213             }
214         }
215     }
216 
QuantizeBlendJit217     void Quantize(SWR_FORMAT format, Value* src[4])
218     {
219         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
220         for (uint32_t c = 0; c < info.numComps; ++c)
221         {
222             if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
223             {
224                 uint32_t swizComp = info.swizzle[c];
225                 float factor = (float)((1 << info.bpc[c]) - 1);
226                 switch (info.type[c])
227                 {
228                 case SWR_TYPE_UNORM:
229                     src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
230                     src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
231                     src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
232                     break;
233                 default: SWR_INVALID("Unsupported format type: %d", info.type[c]);
234                 }
235             }
236         }
237     }
238 
239     template<bool Color, bool Alpha>
BlendFuncBlendJit240     void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
241     {
242         Value* out[4];
243         Value* srcBlend[4];
244         Value* dstBlend[4];
245         for (uint32_t i = 0; i < 4; ++i)
246         {
247             srcBlend[i] = FMUL(src[i], srcFactor[i]);
248             dstBlend[i] = FMUL(dst[i], dstFactor[i]);
249         }
250 
251         switch (blendOp)
252         {
253         case BLENDOP_ADD:
254             out[0] = FADD(srcBlend[0], dstBlend[0]);
255             out[1] = FADD(srcBlend[1], dstBlend[1]);
256             out[2] = FADD(srcBlend[2], dstBlend[2]);
257             out[3] = FADD(srcBlend[3], dstBlend[3]);
258             break;
259 
260         case BLENDOP_SUBTRACT:
261             out[0] = FSUB(srcBlend[0], dstBlend[0]);
262             out[1] = FSUB(srcBlend[1], dstBlend[1]);
263             out[2] = FSUB(srcBlend[2], dstBlend[2]);
264             out[3] = FSUB(srcBlend[3], dstBlend[3]);
265             break;
266 
267         case BLENDOP_REVSUBTRACT:
268             out[0] = FSUB(dstBlend[0], srcBlend[0]);
269             out[1] = FSUB(dstBlend[1], srcBlend[1]);
270             out[2] = FSUB(dstBlend[2], srcBlend[2]);
271             out[3] = FSUB(dstBlend[3], srcBlend[3]);
272             break;
273 
274         case BLENDOP_MIN:
275             out[0] = VMINPS(src[0], dst[0]);
276             out[1] = VMINPS(src[1], dst[1]);
277             out[2] = VMINPS(src[2], dst[2]);
278             out[3] = VMINPS(src[3], dst[3]);
279             break;
280 
281         case BLENDOP_MAX:
282             out[0] = VMAXPS(src[0], dst[0]);
283             out[1] = VMAXPS(src[1], dst[1]);
284             out[2] = VMAXPS(src[2], dst[2]);
285             out[3] = VMAXPS(src[3], dst[3]);
286             break;
287 
288         default:
289             SWR_INVALID("Unsupported blend operation: %d", blendOp);
290             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
291             break;
292         }
293 
294         if (Color)
295         {
296             result[0] = out[0];
297             result[1] = out[1];
298             result[2] = out[2];
299         }
300 
301         if (Alpha)
302         {
303             result[3] = out[3];
304         }
305     }
306 
LogicOpFuncBlendJit307     void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
308     {
309         // Op: (s == PS output, d = RT contents)
310         switch(logicOp)
311         {
312         case LOGICOP_CLEAR:
313             result[0] = VIMMED1(0);
314             result[1] = VIMMED1(0);
315             result[2] = VIMMED1(0);
316             result[3] = VIMMED1(0);
317             break;
318 
319         case LOGICOP_NOR:
320             // ~(s | d)
321             result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
322             result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
323             result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
324             result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
325             break;
326 
327         case LOGICOP_AND_INVERTED:
328             // ~s & d
329             // todo: use avx andnot instr when I can find the intrinsic to call
330             result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
331             result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
332             result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
333             result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
334             break;
335 
336         case LOGICOP_COPY_INVERTED:
337             // ~s
338             result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
339             result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
340             result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
341             result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
342             break;
343 
344         case LOGICOP_AND_REVERSE:
345             // s & ~d
346             // todo: use avx andnot instr when I can find the intrinsic to call
347             result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
348             result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
349             result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
350             result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
351             break;
352 
353         case LOGICOP_INVERT:
354             // ~d
355             result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
356             result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
357             result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
358             result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
359             break;
360 
361         case LOGICOP_XOR:
362             // s ^ d
363             result[0] = XOR(src[0], dst[0]);
364             result[1] = XOR(src[1], dst[1]);
365             result[2] = XOR(src[2], dst[2]);
366             result[3] = XOR(src[3], dst[3]);
367             break;
368 
369         case LOGICOP_NAND:
370             // ~(s & d)
371             result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
372             result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
373             result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
374             result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
375             break;
376 
377         case LOGICOP_AND:
378             // s & d
379             result[0] = AND(src[0], dst[0]);
380             result[1] = AND(src[1], dst[1]);
381             result[2] = AND(src[2], dst[2]);
382             result[3] = AND(src[3], dst[3]);
383             break;
384 
385         case LOGICOP_EQUIV:
386             // ~(s ^ d)
387             result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
388             result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
389             result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
390             result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
391             break;
392 
393         case LOGICOP_NOOP:
394             result[0] = dst[0];
395             result[1] = dst[1];
396             result[2] = dst[2];
397             result[3] = dst[3];
398             break;
399 
400         case LOGICOP_OR_INVERTED:
401             // ~s | d
402             result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
403             result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
404             result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
405             result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
406             break;
407 
408         case LOGICOP_COPY:
409             result[0] = src[0];
410             result[1] = src[1];
411             result[2] = src[2];
412             result[3] = src[3];
413             break;
414 
415         case LOGICOP_OR_REVERSE:
416             // s | ~d
417             result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
418             result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
419             result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
420             result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
421             break;
422 
423         case LOGICOP_OR:
424             // s | d
425             result[0] = OR(src[0], dst[0]);
426             result[1] = OR(src[1], dst[1]);
427             result[2] = OR(src[2], dst[2]);
428             result[3] = OR(src[3], dst[3]);
429             break;
430 
431         case LOGICOP_SET:
432             result[0] = VIMMED1(0xFFFFFFFF);
433             result[1] = VIMMED1(0xFFFFFFFF);
434             result[2] = VIMMED1(0xFFFFFFFF);
435             result[3] = VIMMED1(0xFFFFFFFF);
436             break;
437 
438         default:
439             SWR_INVALID("Unsupported logic operation: %d", logicOp);
440             result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
441             break;
442         }
443     }
444 
AlphaTestBlendJit445     void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
446     {
447         // load uint32_t reference
448         Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
449 
450         // load alpha
451         Value* pAlpha = LOAD(ppAlpha);
452 
453         Value* pTest = nullptr;
454         if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
455         {
456             // convert float alpha to unorm8
457             Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
458             pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
459 
460             // compare
461             switch (state.alphaTestFunction)
462             {
463             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
464             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
465             case ZFUNC_LT:      pTest = ICMP_ULT(pAlphaU8, pRef); break;
466             case ZFUNC_EQ:      pTest = ICMP_EQ(pAlphaU8, pRef); break;
467             case ZFUNC_LE:      pTest = ICMP_ULE(pAlphaU8, pRef); break;
468             case ZFUNC_GT:      pTest = ICMP_UGT(pAlphaU8, pRef); break;
469             case ZFUNC_NE:      pTest = ICMP_NE(pAlphaU8, pRef); break;
470             case ZFUNC_GE:      pTest = ICMP_UGE(pAlphaU8, pRef); break;
471             default:
472                 SWR_INVALID("Invalid alpha test function");
473                 break;
474             }
475         }
476         else
477         {
478             // cast ref to float
479             pRef = BITCAST(pRef, mSimdFP32Ty);
480 
481             // compare
482             switch (state.alphaTestFunction)
483             {
484             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
485             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
486             case ZFUNC_LT:      pTest = FCMP_OLT(pAlpha, pRef); break;
487             case ZFUNC_EQ:      pTest = FCMP_OEQ(pAlpha, pRef); break;
488             case ZFUNC_LE:      pTest = FCMP_OLE(pAlpha, pRef); break;
489             case ZFUNC_GT:      pTest = FCMP_OGT(pAlpha, pRef); break;
490             case ZFUNC_NE:      pTest = FCMP_ONE(pAlpha, pRef); break;
491             case ZFUNC_GE:      pTest = FCMP_OGE(pAlpha, pRef); break;
492             default:
493                 SWR_INVALID("Invalid alpha test function");
494                 break;
495             }
496         }
497 
498         // load current mask
499         Value* pMask = LOAD(ppMask);
500 
501         // convert to int1 mask
502         pMask = MASK(pMask);
503 
504         // and with alpha test result
505         pMask = AND(pMask, pTest);
506 
507         // convert back to vector mask
508         pMask = VMASK(pMask);
509 
510         // store new mask
511         STORE(pMask, ppMask);
512     }
513 
CreateBlendJit514     Function* Create(const BLEND_COMPILE_STATE& state)
515     {
516         std::stringstream fnName("BLND_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
517         fnName << ComputeCRC(0, &state, sizeof(state));
518 
519         // blend function signature
520         //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
521 
522         std::vector<Type*> args{
523             PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
524             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
525             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
526             PointerType::get(mSimdFP32Ty, 0),               // src0alpha
527             Type::getInt32Ty(JM()->mContext),               // sampleNum
528             PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
529             PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
530             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* oMask
531             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* pMask
532         };
533 
534         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
535         Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
536         blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
537 
538         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
539 
540         IRB()->SetInsertPoint(entry);
541 
542         // arguments
543         auto argitr = blendFunc->arg_begin();
544         Value* pBlendState = &*argitr++;
545         pBlendState->setName("pBlendState");
546         Value* pSrc = &*argitr++;
547         pSrc->setName("src");
548         Value* pSrc1 = &*argitr++;
549         pSrc1->setName("src1");
550         Value* pSrc0Alpha = &*argitr++;
551         pSrc0Alpha->setName("src0alpha");
552         Value* sampleNum = &*argitr++;
553         sampleNum->setName("sampleNum");
554         Value* pDst = &*argitr++;
555         pDst->setName("pDst");
556         Value* pResult = &*argitr++;
557         pResult->setName("result");
558         Value* ppoMask = &*argitr++;
559         ppoMask->setName("ppoMask");
560         Value* ppMask = &*argitr++;
561         ppMask->setName("pMask");
562 
563         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
564         Value* dst[4];
565         Value* constantColor[4];
566         Value* src[4];
567         Value* src1[4];
568         Value* result[4];
569         for (uint32_t i = 0; i < 4; ++i)
570         {
571             // load hot tile
572             dst[i] = LOAD(pDst, { i });
573 
574             // load constant color
575             constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
576 
577             // load src
578             src[i] = LOAD(pSrc, { i });
579 
580             // load src1
581             src1[i] = LOAD(pSrc1, { i });
582         }
583         Value* currentSampleMask = VIMMED1(-1);
584         if (state.desc.alphaToCoverageEnable)
585         {
586             Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
587             uint32_t bits = (1 << state.desc.numSamples) - 1;
588             currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
589             currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
590         }
591 
592         // alpha test
593         if (state.desc.alphaTestEnable)
594         {
595             AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
596         }
597 
598         // color blend
599         if (state.blendState.blendEnable)
600         {
601             // clamp sources
602             Clamp(state.format, src);
603             Clamp(state.format, src1);
604             Clamp(state.format, dst);
605             Clamp(state.format, constantColor);
606 
607             // apply defaults to hottile contents to take into account missing components
608             ApplyDefaults(state.format, dst);
609 
610             // Force defaults for unused 'X' components
611             ApplyUnusedDefaults(state.format, dst);
612 
613             // Quantize low precision components
614             Quantize(state.format, dst);
615 
616             // special case clamping for R11G11B10_float which has no sign bit
617             if (state.format == R11G11B10_FLOAT)
618             {
619                 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
620                 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
621                 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
622                 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
623             }
624 
625             Value* srcFactor[4];
626             Value* dstFactor[4];
627             if (state.desc.independentAlphaBlendEnable)
628             {
629                 GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
630                 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
631 
632                 GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
633                 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
634 
635                 BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
636                 BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
637             }
638             else
639             {
640                 GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
641                 GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
642 
643                 BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
644             }
645 
646             // store results out
647             for (uint32_t i = 0; i < 4; ++i)
648             {
649                 STORE(result[i], pResult, { i });
650             }
651         }
652 
653         if(state.blendState.logicOpEnable)
654         {
655             const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
656             Value* vMask[4];
657             float scale[4];
658 
659             if (!state.blendState.blendEnable)
660             {
661                 Clamp(state.format, src);
662                 Clamp(state.format, dst);
663             }
664 
665             for(uint32_t i = 0; i < 4; i++)
666             {
667                 if (info.type[i] == SWR_TYPE_UNUSED)
668                 {
669                     continue;
670                 }
671 
672                 if (info.bpc[i] >= 32)
673                 {
674                     vMask[i] = VIMMED1(0xFFFFFFFF);
675                     scale[i] = 0xFFFFFFFF;
676                 }
677                 else
678                 {
679                     vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
680                     if (info.type[i] == SWR_TYPE_SNORM)
681                         scale[i] = (1 << (info.bpc[i] - 1)) - 1;
682                     else
683                         scale[i] = (1 << info.bpc[i]) - 1;
684                 }
685 
686                 switch (info.type[i])
687                 {
688                 default:
689                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
690                     break;
691 
692                 case SWR_TYPE_UNKNOWN:
693                 case SWR_TYPE_UNUSED:
694                     // fallthrough
695 
696                 case SWR_TYPE_UINT:
697                 case SWR_TYPE_SINT:
698                     src[i] = BITCAST(src[i], mSimdInt32Ty);
699                     dst[i] = BITCAST(dst[i], mSimdInt32Ty);
700                     break;
701                 case SWR_TYPE_SNORM:
702                     src[i] = FP_TO_SI(
703                         FMUL(src[i], VIMMED1(scale[i])),
704                         mSimdInt32Ty);
705                     dst[i] = FP_TO_SI(
706                         FMUL(dst[i], VIMMED1(scale[i])),
707                         mSimdInt32Ty);
708                     break;
709                 case SWR_TYPE_UNORM:
710                     src[i] = FP_TO_UI(
711                         FMUL(src[i], VIMMED1(scale[i])),
712                         mSimdInt32Ty);
713                     dst[i] = FP_TO_UI(
714                         FMUL(dst[i], VIMMED1(scale[i])),
715                         mSimdInt32Ty);
716                     break;
717                 }
718             }
719 
720             LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
721 
722             // store results out
723             for(uint32_t i = 0; i < 4; ++i)
724             {
725                 if (info.type[i] == SWR_TYPE_UNUSED)
726                 {
727                     continue;
728                 }
729 
730                 // clear upper bits from PS output not in RT format after doing logic op
731                 result[i] = AND(result[i], vMask[i]);
732 
733                 switch (info.type[i])
734                 {
735                 default:
736                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
737                     break;
738 
739                 case SWR_TYPE_UNKNOWN:
740                 case SWR_TYPE_UNUSED:
741                     // fallthrough
742 
743                 case SWR_TYPE_UINT:
744                 case SWR_TYPE_SINT:
745                     result[i] = BITCAST(result[i], mSimdFP32Ty);
746                     break;
747                 case SWR_TYPE_SNORM:
748                     result[i] = SHL(result[i], C(32 - info.bpc[i]));
749                     result[i] = ASHR(result[i], C(32 - info.bpc[i]));
750                     result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty),
751                                      VIMMED1(1.0f / scale[i]));
752                     break;
753                 case SWR_TYPE_UNORM:
754                     result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
755                                      VIMMED1(1.0f / scale[i]));
756                     break;
757                 }
758 
759                 STORE(result[i], pResult, {i});
760             }
761         }
762 
763         if(state.desc.oMaskEnable)
764         {
765             assert(!(state.desc.alphaToCoverageEnable));
766             // load current mask
767             Value* oMask = LOAD(ppoMask);
768             currentSampleMask = AND(oMask, currentSampleMask);
769         }
770 
771         if(state.desc.sampleMaskEnable)
772         {
773             Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
774             currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
775         }
776 
777         if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
778            state.desc.oMaskEnable)
779         {
780             // load coverage mask and mask off any lanes with no samples
781             Value* pMask = LOAD(ppMask);
782             Value* sampleMasked = SHL(C(1), sampleNum);
783             currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked));
784             currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
785             Value* outputMask = AND(pMask, currentSampleMask);
786             // store new mask
787             STORE(outputMask, GEP(ppMask, C(0)));
788         }
789 
790         RET_VOID();
791 
792         JitManager::DumpToFile(blendFunc, "");
793 
794         ::FunctionPassManager passes(JM()->mpCurrentModule);
795 
796         passes.add(createBreakCriticalEdgesPass());
797         passes.add(createCFGSimplificationPass());
798         passes.add(createEarlyCSEPass());
799         passes.add(createPromoteMemoryToRegisterPass());
800         passes.add(createCFGSimplificationPass());
801         passes.add(createEarlyCSEPass());
802         passes.add(createInstructionCombiningPass());
803         passes.add(createInstructionSimplifierPass());
804         passes.add(createConstantPropagationPass());
805         passes.add(createSCCPPass());
806         passes.add(createAggressiveDCEPass());
807 
808         passes.run(*blendFunc);
809 
810         JitManager::DumpToFile(blendFunc, "optimized");
811 
812         return blendFunc;
813     }
814 };
815 
816 //////////////////////////////////////////////////////////////////////////
817 /// @brief JITs from fetch shader IR
818 /// @param hJitMgr - JitManager handle
819 /// @param func   - LLVM function IR
820 /// @return PFN_FETCH_FUNC - pointer to fetch code
JitBlendFunc(HANDLE hJitMgr,const HANDLE hFunc)821 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
822 {
823     const llvm::Function *func = (const llvm::Function*)hFunc;
824     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
825     PFN_BLEND_JIT_FUNC pfnBlend;
826     pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
827     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
828     pJitMgr->mIsModuleFinalized = true;
829 
830     return pfnBlend;
831 }
832 
833 //////////////////////////////////////////////////////////////////////////
834 /// @brief JIT compiles blend shader
835 /// @param hJitMgr - JitManager handle
836 /// @param state   - blend state to build function from
JitCompileBlend(HANDLE hJitMgr,const BLEND_COMPILE_STATE & state)837 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
838 {
839     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
840 
841     pJitMgr->SetupNewModule();
842 
843     BlendJit theJit(pJitMgr);
844     HANDLE hFunc = theJit.Create(state);
845 
846     return JitBlendFunc(hJitMgr, hFunc);
847 }
848