1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>
#include <cstring>
35 
36 namespace SwrJit
37 {
38     void __cdecl CallPrint(const char* fmt, ...);
39 
40     //////////////////////////////////////////////////////////////////////////
41     /// @brief Convert an IEEE 754 32-bit single precision float to an
42     ///        16 bit float with 5 exponent bits and a variable
43     ///        number of mantissa bits.
44     /// @param val - 32-bit float
45     /// @todo Maybe move this outside of this file into a header?
ConvertFloat32ToFloat16(float val)46     static uint16_t ConvertFloat32ToFloat16(float val)
47     {
48         uint32_t sign, exp, mant;
49         uint32_t roundBits;
50 
51         // Extract the sign, exponent, and mantissa
52         uint32_t uf = *(uint32_t*)&val;
53         sign = (uf & 0x80000000) >> 31;
54         exp = (uf & 0x7F800000) >> 23;
55         mant = uf & 0x007FFFFF;
56 
57         // Check for out of range
58         if (std::isnan(val))
59         {
60             exp = 0x1F;
61             mant = 0x200;
62             sign = 1;                     // set the sign bit for NANs
63         }
64         else if (std::isinf(val))
65         {
66             exp = 0x1f;
67             mant = 0x0;
68         }
69         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
70         {
71             exp = 0x1E;
72             mant = 0x3FF;
73         }
74         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
75         {
76             mant |= 0x00800000;
77             for (; exp <= 0x70; mant >>= 1, exp++)
78                 ;
79             exp = 0;
80             mant = mant >> 13;
81         }
82         else if (exp < 0x66) // Too small to represent -> Zero
83         {
84             exp = 0;
85             mant = 0;
86         }
87         else
88         {
89             // Saves bits that will be shifted off for rounding
90             roundBits = mant & 0x1FFFu;
91             // convert exponent and mantissa to 16 bit format
92             exp = exp - 0x70;
93             mant = mant >> 13;
94 
95             // Essentially RTZ, but round up if off by only 1 lsb
96             if (roundBits == 0x1FFFu)
97             {
98                 mant++;
99                 // check for overflow
100                 if ((mant & 0xC00u) != 0)
101                     exp++;
102                 // make sure only the needed bits are used
103                 mant &= 0x3FF;
104             }
105         }
106 
107         uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
108         return (uint16_t)tmpVal;
109     }
110 
111     //////////////////////////////////////////////////////////////////////////
112     /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
113     ///        float
114     /// @param val - 16-bit float
115     /// @todo Maybe move this outside of this file into a header?
ConvertFloat16ToFloat32(uint32_t val)116     static float ConvertFloat16ToFloat32(uint32_t val)
117     {
118         uint32_t result;
119         if ((val & 0x7fff) == 0)
120         {
121             result = ((uint32_t)(val & 0x8000)) << 16;
122         }
123         else if ((val & 0x7c00) == 0x7c00)
124         {
125             result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
126             result |= ((uint32_t)val & 0x8000) << 16;
127         }
128         else
129         {
130             uint32_t sign = (val & 0x8000) << 16;
131             uint32_t mant = (val & 0x3ff) << 13;
132             uint32_t exp = (val >> 10) & 0x1f;
133             if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
134             {
135                 mant <<= 1;
136                 while (mant < (0x400 << 13))
137                 {
138                     exp--;
139                     mant <<= 1;
140                 }
141                 mant &= (0x3ff << 13);
142             }
143             exp = ((exp - 15 + 127) & 0xff) << 23;
144             result = sign | exp | mant;
145         }
146 
147         return *(float*)&result;
148     }
149 
C(bool i)150     Constant *Builder::C(bool i)
151     {
152         return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
153     }
154 
C(char i)155     Constant *Builder::C(char i)
156     {
157         return ConstantInt::get(IRB()->getInt8Ty(), i);
158     }
159 
C(uint8_t i)160     Constant *Builder::C(uint8_t i)
161     {
162         return ConstantInt::get(IRB()->getInt8Ty(), i);
163     }
164 
C(int i)165     Constant *Builder::C(int i)
166     {
167         return ConstantInt::get(IRB()->getInt32Ty(), i);
168     }
169 
C(int64_t i)170     Constant *Builder::C(int64_t i)
171     {
172         return ConstantInt::get(IRB()->getInt64Ty(), i);
173     }
174 
C(uint16_t i)175     Constant *Builder::C(uint16_t i)
176     {
177         return ConstantInt::get(mInt16Ty,i);
178     }
179 
C(uint32_t i)180     Constant *Builder::C(uint32_t i)
181     {
182         return ConstantInt::get(IRB()->getInt32Ty(), i);
183     }
184 
C(float i)185     Constant *Builder::C(float i)
186     {
187         return ConstantFP::get(IRB()->getFloatTy(), i);
188     }
189 
PRED(bool pred)190     Constant *Builder::PRED(bool pred)
191     {
192         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
193     }
194 
VIMMED1(int i)195     Value *Builder::VIMMED1(int i)
196     {
197         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
198     }
199 
VIMMED1_16(int i)200     Value *Builder::VIMMED1_16(int i)
201     {
202         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
203     }
204 
VIMMED1(uint32_t i)205     Value *Builder::VIMMED1(uint32_t i)
206     {
207         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
208     }
209 
VIMMED1_16(uint32_t i)210     Value *Builder::VIMMED1_16(uint32_t i)
211     {
212         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
213     }
214 
VIMMED1(float i)215     Value *Builder::VIMMED1(float i)
216     {
217         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
218     }
219 
VIMMED1_16(float i)220     Value *Builder::VIMMED1_16(float i)
221     {
222         return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
223     }
224 
VIMMED1(bool i)225     Value *Builder::VIMMED1(bool i)
226     {
227         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
228     }
229 
VIMMED1_16(bool i)230     Value *Builder::VIMMED1_16(bool i)
231     {
232         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
233     }
234 
VUNDEF_IPTR()235     Value *Builder::VUNDEF_IPTR()
236     {
237         return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
238     }
239 
VUNDEF(Type * t)240     Value *Builder::VUNDEF(Type* t)
241     {
242         return UndefValue::get(VectorType::get(t, mVWidth));
243     }
244 
VUNDEF_I()245     Value *Builder::VUNDEF_I()
246     {
247         return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
248     }
249 
VUNDEF_I_16()250     Value *Builder::VUNDEF_I_16()
251     {
252         return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
253     }
254 
VUNDEF_F()255     Value *Builder::VUNDEF_F()
256     {
257         return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
258     }
259 
VUNDEF_F_16()260     Value *Builder::VUNDEF_F_16()
261     {
262         return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
263     }
264 
VUNDEF(Type * ty,uint32_t size)265     Value *Builder::VUNDEF(Type *ty, uint32_t size)
266     {
267         return UndefValue::get(VectorType::get(ty, size));
268     }
269 
VBROADCAST(Value * src,const llvm::Twine & name)270     Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name)
271     {
272         // check if src is already a vector
273         if (src->getType()->isVectorTy())
274         {
275             return src;
276         }
277 
278         return VECTOR_SPLAT(mVWidth, src, name);
279     }
280 
VBROADCAST_16(Value * src)281     Value *Builder::VBROADCAST_16(Value *src)
282     {
283         // check if src is already a vector
284         if (src->getType()->isVectorTy())
285         {
286             return src;
287         }
288 
289         return VECTOR_SPLAT(mVWidth16, src);
290     }
291 
IMMED(Value * v)292     uint32_t Builder::IMMED(Value* v)
293     {
294         SWR_ASSERT(isa<ConstantInt>(v));
295         ConstantInt *pValConst = cast<ConstantInt>(v);
296         return pValConst->getZExtValue();
297     }
298 
S_IMMED(Value * v)299     int32_t Builder::S_IMMED(Value* v)
300     {
301         SWR_ASSERT(isa<ConstantInt>(v));
302         ConstantInt *pValConst = cast<ConstantInt>(v);
303         return pValConst->getSExtValue();
304     }
305 
GEP(Value * ptr,const std::initializer_list<Value * > & indexList)306     Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
307     {
308         std::vector<Value*> indices;
309         for (auto i : indexList)
310             indices.push_back(i);
311         return GEPA(ptr, indices);
312     }
313 
GEP(Value * ptr,const std::initializer_list<uint32_t> & indexList)314     Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
315     {
316         std::vector<Value*> indices;
317         for (auto i : indexList)
318             indices.push_back(C(i));
319         return GEPA(ptr, indices);
320     }
321 
IN_BOUNDS_GEP(Value * ptr,const std::initializer_list<Value * > & indexList)322     Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
323     {
324         std::vector<Value*> indices;
325         for (auto i : indexList)
326             indices.push_back(i);
327         return IN_BOUNDS_GEP(ptr, indices);
328     }
329 
IN_BOUNDS_GEP(Value * ptr,const std::initializer_list<uint32_t> & indexList)330     Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
331     {
332         std::vector<Value*> indices;
333         for (auto i : indexList)
334             indices.push_back(C(i));
335         return IN_BOUNDS_GEP(ptr, indices);
336     }
337 
LOAD(Value * basePtr,const std::initializer_list<uint32_t> & indices,const llvm::Twine & name)338     LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
339     {
340         std::vector<Value*> valIndices;
341         for (auto i : indices)
342             valIndices.push_back(C(i));
343         return LOAD(GEPA(basePtr, valIndices), name);
344     }
345 
LOADV(Value * basePtr,const std::initializer_list<Value * > & indices,const llvm::Twine & name)346     LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
347     {
348         std::vector<Value*> valIndices;
349         for (auto i : indices)
350             valIndices.push_back(i);
351         return LOAD(GEPA(basePtr, valIndices), name);
352     }
353 
STORE(Value * val,Value * basePtr,const std::initializer_list<uint32_t> & indices)354     StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
355     {
356         std::vector<Value*> valIndices;
357         for (auto i : indices)
358             valIndices.push_back(C(i));
359         return STORE(val, GEPA(basePtr, valIndices));
360     }
361 
STOREV(Value * val,Value * basePtr,const std::initializer_list<Value * > & indices)362     StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
363     {
364         std::vector<Value*> valIndices;
365         for (auto i : indices)
366             valIndices.push_back(i);
367         return STORE(val, GEPA(basePtr, valIndices));
368     }
369 
CALL(Value * Callee,const std::initializer_list<Value * > & argsList,const llvm::Twine & name)370     CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
371     {
372         std::vector<Value*> args;
373         for (auto arg : argsList)
374             args.push_back(arg);
375         return CALLA(Callee, args, name);
376     }
377 
CALL(Value * Callee,Value * arg)378     CallInst *Builder::CALL(Value *Callee, Value* arg)
379     {
380         std::vector<Value*> args;
381         args.push_back(arg);
382         return CALLA(Callee, args);
383     }
384 
CALL2(Value * Callee,Value * arg1,Value * arg2)385     CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
386     {
387         std::vector<Value*> args;
388         args.push_back(arg1);
389         args.push_back(arg2);
390         return CALLA(Callee, args);
391     }
392 
CALL3(Value * Callee,Value * arg1,Value * arg2,Value * arg3)393     CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
394     {
395         std::vector<Value*> args;
396         args.push_back(arg1);
397         args.push_back(arg2);
398         args.push_back(arg3);
399         return CALLA(Callee, args);
400     }
401 
402     //////////////////////////////////////////////////////////////////////////
DEBUGTRAP()403     Value *Builder::DEBUGTRAP()
404     {
405         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
406         return CALL(func);
407     }
408 
VRCP(Value * va,const llvm::Twine & name)409     Value *Builder::VRCP(Value *va, const llvm::Twine& name)
410     {
411         return FDIV(VIMMED1(1.0f), va, name);  // 1 / a
412     }
413 
VPLANEPS(Value * vA,Value * vB,Value * vC,Value * & vX,Value * & vY)414     Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
415     {
416         Value* vOut = FMADDPS(vA, vX, vC);
417         vOut = FMADDPS(vB, vY, vOut);
418         return vOut;
419     }
420 
421     //////////////////////////////////////////////////////////////////////////
422     /// @brief Generate an i32 masked load operation in LLVM IR.  If not
423     /// supported on the underlying platform, emulate it with float masked load
424     /// @param src - base address pointer for the load
425     /// @param vMask - SIMD wide mask that controls whether to access memory load 0
MASKLOADD(Value * src,Value * mask)426     Value *Builder::MASKLOADD(Value* src,Value* mask)
427     {
428         Value* vResult;
429         // use avx2 gather instruction is available
430         if(JM()->mArch.AVX2())
431         {
432             Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
433             vResult = CALL(func,{src,mask});
434         }
435         else
436         {
437             // maskload intrinsic expects integer mask operand in llvm >= 3.8
438     #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
439             mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
440     #else
441             mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
442     #endif
443             Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
444             vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
445         }
446         return vResult;
447     }
448 
449     //////////////////////////////////////////////////////////////////////////
450     /// @brief insert a JIT call to CallPrint
451     /// - outputs formatted string to both stdout and VS output window
452     /// - DEBUG builds only
453     /// Usage example:
454     ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
455     ///   where C(lane) creates a constant value to print, and pIndex is the Value*
456     ///   result from a GEP, printing out the pointer to memory
457     /// @param printStr - constant string to print, which includes format specifiers
458     /// @param printArgs - initializer list of Value*'s to print to std out
PRINT(const std::string & printStr,const std::initializer_list<Value * > & printArgs)459     CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
460     {
461         // push the arguments to CallPrint into a vector
462         std::vector<Value*> printCallArgs;
463         // save room for the format string.  we still need to modify it for vectors
464         printCallArgs.resize(1);
465 
466         // search through the format string for special processing
467         size_t pos = 0;
468         std::string tempStr(printStr);
469         pos = tempStr.find('%', pos);
470         auto v = printArgs.begin();
471 
472         while ((pos != std::string::npos) && (v != printArgs.end()))
473         {
474             Value* pArg = *v;
475             Type* pType = pArg->getType();
476 
477             if (pType->isVectorTy())
478             {
479                 Type* pContainedType = pType->getContainedType(0);
480 
481                 if (toupper(tempStr[pos + 1]) == 'X')
482                 {
483                     tempStr[pos] = '0';
484                     tempStr[pos + 1] = 'x';
485                     tempStr.insert(pos + 2, "%08X ");
486                     pos += 7;
487 
488                     printCallArgs.push_back(VEXTRACT(pArg, C(0)));
489 
490                     std::string vectorFormatStr;
491                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
492                     {
493                         vectorFormatStr += "0x%08X ";
494                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
495                     }
496 
497                     tempStr.insert(pos, vectorFormatStr);
498                     pos += vectorFormatStr.size();
499                 }
500                 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
501                 {
502                     uint32_t i = 0;
503                     for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
504                     {
505                         tempStr.insert(pos, std::string("%f "));
506                         pos += 3;
507                         printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
508                     }
509                     printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
510                 }
511                 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
512                 {
513                     uint32_t i = 0;
514                     for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
515                     {
516                         tempStr.insert(pos, std::string("%d "));
517                         pos += 3;
518                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
519                     }
520                     printCallArgs.push_back(VEXTRACT(pArg, C(i)));
521                 }
522             }
523             else
524             {
525                 if (toupper(tempStr[pos + 1]) == 'X')
526                 {
527                     tempStr[pos] = '0';
528                     tempStr.insert(pos + 1, "x%08");
529                     printCallArgs.push_back(pArg);
530                     pos += 3;
531                 }
532                 // for %f we need to cast float Values to doubles so that they print out correctly
533                 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
534                 {
535                     printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
536                     pos++;
537                 }
538                 else
539                 {
540                     printCallArgs.push_back(pArg);
541                 }
542             }
543 
544             // advance to the next arguement
545             v++;
546             pos = tempStr.find('%', ++pos);
547         }
548 
549         // create global variable constant string
550         Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
551         GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
552         JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
553 
554         // get a pointer to the first character in the constant string array
555         std::vector<Constant*> geplist{C(0),C(0)};
556         Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
557 
558         // insert the pointer to the format string in the argument vector
559         printCallArgs[0] = strGEP;
560 
561         // get pointer to CallPrint function and insert decl into the module if needed
562         std::vector<Type*> args;
563         args.push_back(PointerType::get(mInt8Ty,0));
564         FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
565         Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
566 
567         // if we haven't yet added the symbol to the symbol table
568         if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
569         {
570             sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
571         }
572 
573         // insert a call to CallPrint
574         return CALLA(callPrintFn,printCallArgs);
575     }
576 
577     //////////////////////////////////////////////////////////////////////////
578     /// @brief Wrapper around PRINT with initializer list.
PRINT(const std::string & printStr)579     CallInst* Builder::PRINT(const std::string &printStr)
580     {
581         return PRINT(printStr, {});
582     }
583 
584     //////////////////////////////////////////////////////////////////////////
585     /// @brief Generate a masked gather operation in LLVM IR.  If not
586     /// supported on the underlying platform, emulate it with loads
587     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
588     /// @param pBase - Int8* base VB address pointer value
589     /// @param vIndices - SIMD wide value of VB byte offsets
590     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
591     /// @param scale - value to scale indices by
GATHERPS(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)592     Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
593     {
594         Value *vGather;
595 
596         // use avx2 gather instruction if available
597         if(JM()->mArch.AVX2())
598         {
599             // force mask to <N x float>, required by vgather
600             Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
601 
602             vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
603         }
604         else
605         {
606             Value* pStack = STACKSAVE();
607 
608             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
609             Value* vSrcPtr = ALLOCA(vSrc->getType());
610             STORE(vSrc, vSrcPtr);
611 
612             vGather = VUNDEF_F();
613             Value *vScaleVec = VIMMED1((uint32_t)scale);
614             Value *vOffsets = MUL(vIndices,vScaleVec);
615             for(uint32_t i = 0; i < mVWidth; ++i)
616             {
617                 // single component byte index
618                 Value *offset = VEXTRACT(vOffsets,C(i));
619                 // byte pointer to component
620                 Value *loadAddress = GEP(pBase,offset);
621                 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
622                 // pointer to the value to load if we're masking off a component
623                 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
624                 Value *selMask = VEXTRACT(vMask,C(i));
625                 // switch in a safe address to load if we're trying to access a vertex
626                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
627                 Value *val = LOAD(validAddress);
628                 vGather = VINSERT(vGather,val,C(i));
629             }
630 
631             STACKRESTORE(pStack);
632         }
633 
634         return vGather;
635     }
636 
GATHERPS_16(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)637     Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
638     {
639         Value *vGather = VUNDEF_F_16();
640 
641         // use AVX512F gather instruction if available
642         if (JM()->mArch.AVX512F())
643         {
644             // force mask to <N-bit Integer>, required by vgather2
645             Value *mask = BITCAST(vMask, mInt16Ty);
646 
647             vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
648         }
649         else
650         {
651             Value *src0 = EXTRACT_16(vSrc, 0);
652             Value *src1 = EXTRACT_16(vSrc, 1);
653 
654             Value *indices0 = EXTRACT_16(vIndices, 0);
655             Value *indices1 = EXTRACT_16(vIndices, 1);
656 
657             Value *mask0 = EXTRACT_16(vMask, 0);
658             Value *mask1 = EXTRACT_16(vMask, 1);
659 
660             Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
661             Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
662 
663             vGather = JOIN_16(gather0, gather1);
664         }
665 
666         return vGather;
667     }
668 
669     //////////////////////////////////////////////////////////////////////////
670     /// @brief Generate a masked gather operation in LLVM IR.  If not
671     /// supported on the underlying platform, emulate it with loads
672     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
673     /// @param pBase - Int8* base VB address pointer value
674     /// @param vIndices - SIMD wide value of VB byte offsets
675     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
676     /// @param scale - value to scale indices by
GATHERDD(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)677     Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
678     {
679         Value* vGather;
680 
681         // use avx2 gather instruction if available
682         if(JM()->mArch.AVX2())
683         {
684             vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
685         }
686         else
687         {
688             Value* pStack = STACKSAVE();
689 
690             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
691             Value* vSrcPtr = ALLOCA(vSrc->getType());
692             STORE(vSrc, vSrcPtr);
693 
694             vGather = VUNDEF_I();
695             Value *vScaleVec = VIMMED1((uint32_t)scale);
696             Value *vOffsets = MUL(vIndices, vScaleVec);
697             for(uint32_t i = 0; i < mVWidth; ++i)
698             {
699                 // single component byte index
700                 Value *offset = VEXTRACT(vOffsets, C(i));
701                 // byte pointer to component
702                 Value *loadAddress = GEP(pBase, offset);
703                 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
704                 // pointer to the value to load if we're masking off a component
705                 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
706                 Value *selMask = VEXTRACT(vMask, C(i));
707                 // switch in a safe address to load if we're trying to access a vertex
708                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
709                 Value *val = LOAD(validAddress, C(0));
710                 vGather = VINSERT(vGather, val, C(i));
711             }
712 
713             STACKRESTORE(pStack);
714         }
715 
716         return vGather;
717     }
718 
GATHERDD_16(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)719     Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
720     {
721         Value *vGather = VUNDEF_I_16();
722 
723         // use AVX512F gather instruction if available
724         if (JM()->mArch.AVX512F())
725         {
726             // force mask to <N-bit Integer>, required by vgather2
727             Value *mask = BITCAST(vMask, mInt16Ty);
728 
729             vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
730         }
731         else
732         {
733             Value *src0 = EXTRACT_16(vSrc, 0);
734             Value *src1 = EXTRACT_16(vSrc, 1);
735 
736             Value *indices0 = EXTRACT_16(vIndices, 0);
737             Value *indices1 = EXTRACT_16(vIndices, 1);
738 
739             Value *mask0 = EXTRACT_16(vMask, 0);
740             Value *mask1 = EXTRACT_16(vMask, 1);
741 
742             Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
743             Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
744 
745             vGather = JOIN_16(gather0, gather1);
746         }
747 
748         return vGather;
749     }
750 
751     //////////////////////////////////////////////////////////////////////////
752     /// @brief Generate a masked gather operation in LLVM IR.  If not
753     /// supported on the underlying platform, emulate it with loads
754     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
755     /// @param pBase - Int8* base VB address pointer value
756     /// @param vIndices - SIMD wide value of VB byte offsets
757     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
758     /// @param scale - value to scale indices by
GATHERPD(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)759     Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
760     {
761         Value* vGather;
762 
763         // use avx2 gather instruction if available
764         if(JM()->mArch.AVX2())
765         {
766             vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
767             vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
768         }
769         else
770         {
771             Value* pStack = STACKSAVE();
772 
773             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
774             Value* vSrcPtr = ALLOCA(vSrc->getType());
775             STORE(vSrc, vSrcPtr);
776 
777             vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
778             Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
779             Value *vOffsets = MUL(vIndices,vScaleVec);
780             for(uint32_t i = 0; i < mVWidth/2; ++i)
781             {
782                 // single component byte index
783                 Value *offset = VEXTRACT(vOffsets,C(i));
784                 // byte pointer to component
785                 Value *loadAddress = GEP(pBase,offset);
786                 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
787                 // pointer to the value to load if we're masking off a component
788                 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
789                 Value *selMask = VEXTRACT(vMask,C(i));
790                 // switch in a safe address to load if we're trying to access a vertex
791                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
792                 Value *val = LOAD(validAddress);
793                 vGather = VINSERT(vGather,val,C(i));
794             }
795             STACKRESTORE(pStack);
796         }
797         return vGather;
798     }
799 
EXTRACT_16(Value * x,uint32_t imm)800     Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
801     {
802         if (imm == 0)
803         {
804             return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
805         }
806         else
807         {
808             return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
809         }
810     }
811 
JOIN_16(Value * a,Value * b)812     Value *Builder::JOIN_16(Value *a, Value *b)
813     {
814         return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
815     }
816 
817     //////////////////////////////////////////////////////////////////////////
818     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
MASK(Value * vmask)819     Value *Builder::MASK(Value *vmask)
820     {
821         Value *src = BITCAST(vmask, mSimdInt32Ty);
822         return ICMP_SLT(src, VIMMED1(0));
823     }
824 
MASK_16(Value * vmask)825     Value *Builder::MASK_16(Value *vmask)
826     {
827         Value *src = BITCAST(vmask, mSimd16Int32Ty);
828         return ICMP_SLT(src, VIMMED1_16(0));
829     }
830 
831     //////////////////////////////////////////////////////////////////////////
832     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
VMASK(Value * mask)833     Value *Builder::VMASK(Value *mask)
834     {
835         return S_EXT(mask, mSimdInt32Ty);
836     }
837 
VMASK_16(Value * mask)838     Value *Builder::VMASK_16(Value *mask)
839     {
840         return S_EXT(mask, mSimd16Int32Ty);
841     }
842 
843     //////////////////////////////////////////////////////////////////////////
844     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
845     /// supported on the underlying platform, emulate it
846     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
847     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
848     /// Byte masks in lower 128 lane of b selects 8 bit values from lower
849     /// 128bits of a, and vice versa for the upper lanes.  If the mask
850     /// value is negative, '0' is inserted.
PSHUFB(Value * a,Value * b)851     Value *Builder::PSHUFB(Value* a, Value* b)
852     {
853         Value* res;
854         // use avx2 pshufb instruction if available
855         if(JM()->mArch.AVX2())
856         {
857             res = VPSHUFB(a, b);
858         }
859         else
860         {
861             Constant* cB = dyn_cast<Constant>(b);
862             // number of 8 bit elements in b
863             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
864             // output vector
865             Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
866 
867             // insert an 8 bit value from the high and low lanes of a per loop iteration
868             numElms /= 2;
869             for(uint32_t i = 0; i < numElms; i++)
870             {
871                 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
872                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
873 
874                 // extract values from constant mask
875                 char valLow128bLane =  (char)(cLow128b->getSExtValue());
876                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
877 
878                 Value* insertValLow128b;
879                 Value* insertValHigh128b;
880 
881                 // if the mask value is negative, insert a '0' in the respective output position
882                 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
883                 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
884                 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
885 
886                 vShuf = VINSERT(vShuf, insertValLow128b, i);
887                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
888             }
889             res = vShuf;
890         }
891         return res;
892     }
893 
894     //////////////////////////////////////////////////////////////////////////
895     /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
896     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
897     /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
898     /// lower 8 values are used.
PMOVSXBD(Value * a)899     Value *Builder::PMOVSXBD(Value* a)
900     {
901         // VPMOVSXBD output type
902         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
903         // Extract 8 values from 128bit lane and sign extend
904         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
905     }
906 
907     //////////////////////////////////////////////////////////////////////////
908     /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
909     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
910     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
PMOVSXWD(Value * a)911     Value *Builder::PMOVSXWD(Value* a)
912     {
913         // VPMOVSXWD output type
914         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
915         // Extract 8 values from 128bit lane and sign extend
916         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
917     }
918 
919     //////////////////////////////////////////////////////////////////////////
920     /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
921     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
922     /// platform, emulate it
923     /// @param a - 256bit SIMD lane(8x32bit) of integer values.
924     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
PERMD(Value * a,Value * idx)925     Value *Builder::PERMD(Value* a, Value* idx)
926     {
927         Value* res;
928         // use avx2 permute instruction if available
929         if(JM()->mArch.AVX2())
930         {
931             res = VPERMD(a, idx);
932         }
933         else
934         {
935             if (isa<Constant>(idx))
936             {
937                 res = VSHUFFLE(a, a, idx);
938             }
939             else
940             {
941                 res = VUNDEF_I();
942                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
943                 {
944                     Value* pIndex = VEXTRACT(idx, C(l));
945                     Value* pVal = VEXTRACT(a, pIndex);
946                     res = VINSERT(res, pVal, C(l));
947                 }
948             }
949         }
950         return res;
951     }
952 
953     //////////////////////////////////////////////////////////////////////////
954     /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
955     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
956     /// platform, emulate it
957     /// @param a - 256bit SIMD lane(8x32bit) of float values.
958     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
PERMPS(Value * a,Value * idx)959     Value *Builder::PERMPS(Value* a, Value* idx)
960     {
961         Value* res;
962         // use avx2 permute instruction if available
963         if (JM()->mArch.AVX2())
964         {
965             // llvm 3.6.0 swapped the order of the args to vpermd
966             res = VPERMPS(idx, a);
967         }
968         else
969         {
970             if (isa<Constant>(idx))
971             {
972                 res = VSHUFFLE(a, a, idx);
973             }
974             else
975             {
976                 res = VUNDEF_F();
977                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
978                 {
979                     Value* pIndex = VEXTRACT(idx, C(l));
980                     Value* pVal = VEXTRACT(a, pIndex);
981                     res = VINSERT(res, pVal, C(l));
982                 }
983             }
984         }
985 
986         return res;
987     }
988 
989     //////////////////////////////////////////////////////////////////////////
990     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
991     /// in LLVM IR.  If not supported on the underlying platform, emulate it
992     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
CVTPH2PS(Value * a,const llvm::Twine & name)993     Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
994     {
995         if (JM()->mArch.F16C())
996         {
997             return VCVTPH2PS(a, name);
998         }
999         else
1000         {
1001             FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
1002             Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
1003 
1004             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1005             {
1006                 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
1007             }
1008 
1009             Value* pResult = UndefValue::get(mSimdFP32Ty);
1010             for (uint32_t i = 0; i < mVWidth; ++i)
1011             {
1012                 Value* pSrc = VEXTRACT(a, C(i));
1013                 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
1014                 pResult = VINSERT(pResult, pConv, C(i));
1015             }
1016 
1017             pResult->setName(name);
1018             return pResult;
1019         }
1020     }
1021 
1022     //////////////////////////////////////////////////////////////////////////
1023     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1024     /// in LLVM IR.  If not supported on the underlying platform, emulate it
1025     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
CVTPS2PH(Value * a,Value * rounding)1026     Value *Builder::CVTPS2PH(Value* a, Value* rounding)
1027     {
1028         if (JM()->mArch.F16C())
1029         {
1030             return VCVTPS2PH(a, rounding);
1031         }
1032         else
1033         {
1034             // call scalar C function for now
1035             FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
1036             Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
1037 
1038             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1039             {
1040                 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
1041             }
1042 
1043             Value* pResult = UndefValue::get(mSimdInt16Ty);
1044             for (uint32_t i = 0; i < mVWidth; ++i)
1045             {
1046                 Value* pSrc = VEXTRACT(a, C(i));
1047                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
1048                 pResult = VINSERT(pResult, pConv, C(i));
1049             }
1050 
1051             return pResult;
1052         }
1053     }
1054 
PMAXSD(Value * a,Value * b)1055     Value *Builder::PMAXSD(Value* a, Value* b)
1056     {
1057         Value* cmp = ICMP_SGT(a, b);
1058         return SELECT(cmp, a, b);
1059     }
1060 
PMINSD(Value * a,Value * b)1061     Value *Builder::PMINSD(Value* a, Value* b)
1062     {
1063         Value* cmp = ICMP_SLT(a, b);
1064         return SELECT(cmp, a, b);
1065     }
1066 
Gather4(const SWR_FORMAT format,Value * pSrcBase,Value * byteOffsets,Value * mask,Value * vGatherComponents[],bool bPackedOutput)1067     void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1068                           Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1069     {
1070         const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1071         if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1072         {
1073             GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1074         }
1075         else
1076         {
1077             GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1078         }
1079     }
1080 
GATHER4PS(const SWR_FORMAT_INFO & info,Value * pSrcBase,Value * byteOffsets,Value * vMask,Value * vGatherComponents[],bool bPackedOutput)1081     void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1082                             Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1083     {
1084         switch(info.bpp / info.numComps)
1085         {
1086             case 16:
1087             {
1088                     Value* vGatherResult[2];
1089 
1090                     // TODO: vGatherMaskedVal
1091                     Value* vGatherMaskedVal = VIMMED1((float)0);
1092 
1093                     // always have at least one component out of x or y to fetch
1094 
1095                     vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1096                     // e.g. result of first 8x32bit integer gather for 16bit components
1097                     // 256i - 0    1    2    3    4    5    6    7
1098                     //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1099                     //
1100 
1101                     // if we have at least one component out of x or y to fetch
1102                     if(info.numComps > 2)
1103                     {
1104                         // offset base to the next components(zw) in the vertex to gather
1105                         pSrcBase = GEP(pSrcBase, C((char)4));
1106 
1107                         vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1108                         // e.g. result of second 8x32bit integer gather for 16bit components
1109                         // 256i - 0    1    2    3    4    5    6    7
1110                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1111                         //
1112                     }
1113                     else
1114                     {
1115                         vGatherResult[1] =  vGatherMaskedVal;
1116                     }
1117 
1118                     // Shuffle gathered components into place, each row is a component
1119                     Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1120             }
1121                 break;
1122             case 32:
1123             {
1124                 // apply defaults
1125                 for (uint32_t i = 0; i < 4; ++i)
1126                 {
1127                     vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1128                 }
1129 
1130                 for(uint32_t i = 0; i < info.numComps; i++)
1131                 {
1132                     uint32_t swizzleIndex = info.swizzle[i];
1133 
1134                     // Gather a SIMD of components
1135                     vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1136 
1137                     // offset base to the next component to gather
1138                     pSrcBase = GEP(pSrcBase, C((char)4));
1139                 }
1140             }
1141                 break;
1142             default:
1143                 SWR_INVALID("Invalid float format");
1144                 break;
1145         }
1146     }
1147 
GATHER4DD(const SWR_FORMAT_INFO & info,Value * pSrcBase,Value * byteOffsets,Value * vMask,Value * vGatherComponents[],bool bPackedOutput)1148     void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1149                             Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1150     {
1151         switch (info.bpp / info.numComps)
1152         {
1153             case 8:
1154             {
1155                 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1156                 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1157                 // e.g. result of an 8x32bit integer gather for 8bit components
1158                 // 256i - 0    1    2    3    4    5    6    7
1159                 //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1160 
1161                 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1162             }
1163                 break;
1164             case 16:
1165             {
1166                 Value* vGatherResult[2];
1167 
1168                 // TODO: vGatherMaskedVal
1169                 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1170 
1171                 // always have at least one component out of x or y to fetch
1172 
1173                 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1174                 // e.g. result of first 8x32bit integer gather for 16bit components
1175                 // 256i - 0    1    2    3    4    5    6    7
1176                 //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1177                 //
1178 
1179                 // if we have at least one component out of x or y to fetch
1180                 if(info.numComps > 2)
1181                 {
1182                     // offset base to the next components(zw) in the vertex to gather
1183                     pSrcBase = GEP(pSrcBase, C((char)4));
1184 
1185                     vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1186                     // e.g. result of second 8x32bit integer gather for 16bit components
1187                     // 256i - 0    1    2    3    4    5    6    7
1188                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1189                     //
1190                 }
1191                 else
1192                 {
1193                     vGatherResult[1] = vGatherMaskedVal;
1194                 }
1195 
1196                 // Shuffle gathered components into place, each row is a component
1197                 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1198 
1199             }
1200                 break;
1201             case 32:
1202             {
1203                 // apply defaults
1204                 for (uint32_t i = 0; i < 4; ++i)
1205                 {
1206                     vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1207                 }
1208 
1209                 for(uint32_t i = 0; i < info.numComps; i++)
1210                 {
1211                     uint32_t swizzleIndex = info.swizzle[i];
1212 
1213                     // Gather a SIMD of components
1214                     vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1215 
1216                     // offset base to the next component to gather
1217                     pSrcBase = GEP(pSrcBase, C((char)4));
1218                 }
1219             }
1220                 break;
1221             default:
1222                 SWR_INVALID("unsupported format");
1223             break;
1224         }
1225     }
1226 
Shuffle16bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput[2],Value * vGatherOutput[4],bool bPackedOutput)1227     void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1228     {
1229         // cast types
1230         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1231         Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1232 
1233         // input could either be float or int vector; do shuffle work in int
1234         vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1235         vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1236 
1237         if(bPackedOutput)
1238         {
1239             Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1240 
1241             // shuffle mask
1242             Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1243                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1244             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1245             // after pshufb: group components together in each 128bit lane
1246             // 256i - 0    1    2    3    4    5    6    7
1247             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1248 
1249             Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1250             // after PERMD: move and pack xy components into each 128bit lane
1251             // 256i - 0    1    2    3    4    5    6    7
1252             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1253 
1254             // do the same for zw components
1255             Value* vi128ZW = nullptr;
1256             if(info.numComps > 2)
1257             {
1258                 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1259                 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1260             }
1261 
1262             for(uint32_t i = 0; i < 4; i++)
1263             {
1264                 uint32_t swizzleIndex = info.swizzle[i];
1265                 // todo: fixed for packed
1266                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1267                 if(i >= info.numComps)
1268                 {
1269                     // set the default component val
1270                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1271                     continue;
1272                 }
1273 
1274                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1275                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1276                 // if x or y, use vi128XY permute result, else use vi128ZW
1277                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1278 
1279                 // extract packed component 128 bit lanes
1280                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1281             }
1282 
1283         }
1284         else
1285         {
1286             // pshufb masks for each component
1287             Value* vConstMask[2];
1288             // x/z shuffle mask
1289             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1290                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1291 
1292             // y/w shuffle mask
1293             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1294                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1295 
1296 
1297             // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1298             // apply defaults
1299             for (uint32_t i = 0; i < 4; ++i)
1300             {
1301                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1302             }
1303 
1304             for(uint32_t i = 0; i < info.numComps; i++)
1305             {
1306                 uint32_t swizzleIndex = info.swizzle[i];
1307 
1308                 // select correct constMask for x/z or y/w pshufb
1309                 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1310                 // if x or y, use vi128XY permute result, else use vi128ZW
1311                 uint32_t selectedGather = (i < 2) ? 0 : 1;
1312 
1313                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1314                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1315                 // 256i - 0    1    2    3    4    5    6    7
1316                 //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1317             }
1318         }
1319     }
1320 
Shuffle8bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput,Value * vGatherOutput[],bool bPackedOutput)1321     void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1322     {
1323         // cast types
1324         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1325         Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1326 
1327         if(bPackedOutput)
1328         {
1329             Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1330             // shuffle mask
1331             Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1332                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1333             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1334             // after pshufb: group components together in each 128bit lane
1335             // 256i - 0    1    2    3    4    5    6    7
1336             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1337 
1338             Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1339             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1340             // 256i - 0    1    2    3    4    5    6    7
1341             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1342 
1343             // do the same for zw components
1344             Value* vi128ZW = nullptr;
1345             if(info.numComps > 2)
1346             {
1347                 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1348             }
1349 
1350             // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1351             for(uint32_t i = 0; i < 4; i++)
1352             {
1353                 uint32_t swizzleIndex = info.swizzle[i];
1354                 // todo: fix for packed
1355                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1356                 if(i >= info.numComps)
1357                 {
1358                     // set the default component val
1359                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1360                     continue;
1361                 }
1362 
1363                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1364                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1365                 // if x or y, use vi128XY permute result, else use vi128ZW
1366                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1367 
1368                 // sign extend
1369                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1370             }
1371         }
1372         // else zero extend
1373         else{
1374             // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1375             // apply defaults
1376             for (uint32_t i = 0; i < 4; ++i)
1377             {
1378                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1379             }
1380 
1381             for(uint32_t i = 0; i < info.numComps; i++){
1382                 uint32_t swizzleIndex = info.swizzle[i];
1383 
1384                 // pshufb masks for each component
1385                 Value* vConstMask;
1386                 switch(i)
1387                 {
1388                     case 0:
1389                         // x shuffle mask
1390                         vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1391                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1392                         break;
1393                     case 1:
1394                         // y shuffle mask
1395                         vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1396                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1397                         break;
1398                     case 2:
1399                         // z shuffle mask
1400                         vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1401                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1402                         break;
1403                     case 3:
1404                         // w shuffle mask
1405                         vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1406                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1407                         break;
1408                     default:
1409                         vConstMask = nullptr;
1410                         break;
1411                 }
1412 
1413                     vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1414                     // after pshufb for x channel
1415                     // 256i - 0    1    2    3    4    5    6    7
1416                     //        x000 x000 x000 x000 x000 x000 x000 x000
1417             }
1418         }
1419     }
1420 
1421     // Helper function to create alloca in entry block of function
CreateEntryAlloca(Function * pFunc,Type * pType)1422     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1423     {
1424         auto saveIP = IRB()->saveIP();
1425         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1426                               pFunc->getEntryBlock().begin());
1427         Value* pAlloca = ALLOCA(pType);
1428         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1429         return pAlloca;
1430     }
1431 
CreateEntryAlloca(Function * pFunc,Type * pType,Value * pArraySize)1432     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1433     {
1434         auto saveIP = IRB()->saveIP();
1435         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1436             pFunc->getEntryBlock().begin());
1437         Value* pAlloca = ALLOCA(pType, pArraySize);
1438         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1439         return pAlloca;
1440     }
1441 
1442     //////////////////////////////////////////////////////////////////////////
1443     /// @brief emulates a scatter operation.
1444     /// @param pDst - pointer to destination
1445     /// @param vSrc - vector of src data to scatter
1446     /// @param vOffsets - vector of byte offsets from pDst
1447     /// @param vMask - mask of valid lanes
SCATTERPS(Value * pDst,Value * vSrc,Value * vOffsets,Value * vMask)1448     void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1449     {
1450         /* Scatter algorithm
1451 
1452            while(Index = BitScanForward(mask))
1453                 srcElem = srcVector[Index]
1454                 offsetElem = offsetVector[Index]
1455                 *(pDst + offsetElem) = srcElem
1456                 Update mask (&= ~(1<<Index)
1457 
1458         */
1459 
1460         BasicBlock* pCurBB = IRB()->GetInsertBlock();
1461         Function* pFunc = pCurBB->getParent();
1462         Type* pSrcTy = vSrc->getType()->getVectorElementType();
1463 
1464         // Store vectors on stack
1465         if (pScatterStackSrc == nullptr)
1466         {
1467             // Save off stack allocations and reuse per scatter. Significantly reduces stack
1468             // requirements for shaders with a lot of scatters.
1469             pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1470             pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1471         }
1472 
1473         Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1474         Value* pOffsetsArrayPtr = pScatterStackOffsets;
1475         STORE(vSrc, pSrcArrayPtr);
1476         STORE(vOffsets, pOffsetsArrayPtr);
1477 
1478         // Cast to pointers for random access
1479         pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1480         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1481 
1482         Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1483 
1484         // Get cttz function
1485         Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1486 
1487         // Setup loop basic block
1488         BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
1489 
1490         // compute first set bit
1491         Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1492 
1493         Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1494 
1495         // Split current block
1496         BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1497 
1498         // Remove unconditional jump created by splitBasicBlock
1499         pCurBB->getTerminator()->eraseFromParent();
1500 
1501         // Add terminator to end of original block
1502         IRB()->SetInsertPoint(pCurBB);
1503 
1504         // Add conditional branch
1505         COND_BR(pIsUndef, pPostLoop, pLoop);
1506 
1507         // Add loop basic block contents
1508         IRB()->SetInsertPoint(pLoop);
1509         PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1510         PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1511 
1512         pIndexPhi->addIncoming(pIndex, pCurBB);
1513         pMaskPhi->addIncoming(pMask, pCurBB);
1514 
1515         // Extract elements for this index
1516         Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1517         Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1518 
1519         // GEP to this offset in dst
1520         Value* pCurDst = GEP(pDst, pOffsetElem);
1521         pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1522         STORE(pSrcElem, pCurDst);
1523 
1524         // Update the mask
1525         Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1526 
1527         // Terminator
1528         Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1529 
1530         pIsUndef = ICMP_EQ(pNewIndex, C(32));
1531         COND_BR(pIsUndef, pPostLoop, pLoop);
1532 
1533         // Update phi edges
1534         pIndexPhi->addIncoming(pNewIndex, pLoop);
1535         pMaskPhi->addIncoming(pNewMask, pLoop);
1536 
1537         // Move builder to beginning of post loop
1538         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1539     }
1540 
VABSPS(Value * a)1541     Value* Builder::VABSPS(Value* a)
1542     {
1543         Value* asInt = BITCAST(a, mSimdInt32Ty);
1544         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1545         return result;
1546     }
1547 
ICLAMP(Value * src,Value * low,Value * high,const llvm::Twine & name)1548     Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
1549     {
1550         Value *lowCmp = ICMP_SLT(src, low);
1551         Value *ret = SELECT(lowCmp, low, src);
1552 
1553         Value *highCmp = ICMP_SGT(ret, high);
1554         ret = SELECT(highCmp, high, ret, name);
1555 
1556         return ret;
1557     }
1558 
FCLAMP(Value * src,Value * low,Value * high)1559     Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1560     {
1561         Value *lowCmp = FCMP_OLT(src, low);
1562         Value *ret = SELECT(lowCmp, low, src);
1563 
1564         Value *highCmp = FCMP_OGT(ret, high);
1565         ret = SELECT(highCmp, high, ret);
1566 
1567         return ret;
1568     }
1569 
FCLAMP(Value * src,float low,float high)1570     Value *Builder::FCLAMP(Value* src, float low, float high)
1571     {
1572         Value* result = VMAXPS(src, VIMMED1(low));
1573         result = VMINPS(result, VIMMED1(high));
1574 
1575         return result;
1576     }
1577 
1578     //////////////////////////////////////////////////////////////////////////
1579     /// @brief save/restore stack, providing ability to push/pop the stack and
1580     ///        reduce overall stack requirements for temporary stack use
STACKSAVE()1581     Value* Builder::STACKSAVE()
1582     {
1583         Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1584         return CALLA(pfnStackSave);
1585     }
1586 
STACKRESTORE(Value * pSaved)1587     void Builder::STACKRESTORE(Value* pSaved)
1588     {
1589         Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1590         CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1591     }
1592 
FMADDPS(Value * a,Value * b,Value * c)1593     Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1594     {
1595         Value* vOut;
1596         // use FMADs if available
1597         if(JM()->mArch.AVX2())
1598         {
1599             vOut = VFMADDPS(a, b, c);
1600         }
1601         else
1602         {
1603             vOut = FADD(FMUL(a, b), c);
1604         }
1605         return vOut;
1606     }
1607 
POPCNT(Value * a)1608     Value* Builder::POPCNT(Value* a)
1609     {
1610         Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1611         return CALL(pCtPop, std::initializer_list<Value*>{a});
1612     }
1613 
1614     //////////////////////////////////////////////////////////////////////////
1615     /// @brief C functions called by LLVM IR
1616     //////////////////////////////////////////////////////////////////////////
1617 
1618     //////////////////////////////////////////////////////////////////////////
1619     /// @brief called in JIT code, inserted by PRINT
1620     /// output to both stdout and visual studio debug console
CallPrint(const char * fmt,...)1621     void __cdecl CallPrint(const char* fmt, ...)
1622     {
1623         va_list args;
1624         va_start(args, fmt);
1625         vprintf(fmt, args);
1626 
1627     #if defined( _WIN32 )
1628         char strBuf[1024];
1629         vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1630         OutputDebugStringA(strBuf);
1631     #endif
1632 
1633         va_end(args);
1634     }
1635 
VEXTRACTI128(Value * a,Constant * imm8)1636     Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1637     {
1638         bool flag = !imm8->isZeroValue();
1639         SmallVector<Constant*,8> idx;
1640         for (unsigned i = 0; i < mVWidth / 2; i++) {
1641             idx.push_back(C(flag ? i + mVWidth / 2 : i));
1642         }
1643         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1644     }
1645 
VINSERTI128(Value * a,Value * b,Constant * imm8)1646     Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1647     {
1648         bool flag = !imm8->isZeroValue();
1649         SmallVector<Constant*,8> idx;
1650         for (unsigned i = 0; i < mVWidth; i++) {
1651             idx.push_back(C(i));
1652         }
1653         Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1654 
1655         SmallVector<Constant*,8> idx2;
1656         for (unsigned i = 0; i < mVWidth / 2; i++) {
1657             idx2.push_back(C(flag ? i : i + mVWidth));
1658         }
1659         for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1660             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1661         }
1662         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1663     }
1664 
1665     // rdtsc buckets macros
RDTSC_START(Value * pBucketMgr,Value * pId)1666     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1667     {
1668         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1669         // buckets framework when single threaded
1670         if (KNOB_SINGLE_THREADED)
1671         {
1672             std::vector<Type*> args{
1673                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
1674                 mInt32Ty                        // id
1675             };
1676 
1677             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1678             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1679             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1680             {
1681                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1682             }
1683 
1684             CALL(pFunc, { pBucketMgr, pId });
1685         }
1686     }
1687 
RDTSC_STOP(Value * pBucketMgr,Value * pId)1688     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1689     {
1690         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1691         // buckets framework when single threaded
1692         if (KNOB_SINGLE_THREADED)
1693         {
1694             std::vector<Type*> args{
1695                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
1696                 mInt32Ty                        // id
1697             };
1698 
1699             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1700             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1701             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1702             {
1703                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1704             }
1705 
1706             CALL(pFunc, { pBucketMgr, pId });
1707         }
1708     }
1709 
1710 
GetTypeSize(Type * pType)1711     uint32_t Builder::GetTypeSize(Type* pType)
1712     {
1713         if (pType->isStructTy())
1714         {
1715             uint32_t numElems = pType->getStructNumElements();
1716             Type* pElemTy = pType->getStructElementType(0);
1717             return numElems * GetTypeSize(pElemTy);
1718         }
1719 
1720         if (pType->isArrayTy())
1721         {
1722             uint32_t numElems = pType->getArrayNumElements();
1723             Type* pElemTy = pType->getArrayElementType();
1724             return numElems * GetTypeSize(pElemTy);
1725         }
1726 
1727         if (pType->isIntegerTy())
1728         {
1729             uint32_t bitSize = pType->getIntegerBitWidth();
1730             return bitSize / 8;
1731         }
1732 
1733         if (pType->isFloatTy())
1734         {
1735             return 4;
1736         }
1737 
1738         if (pType->isHalfTy())
1739         {
1740             return 2;
1741         }
1742 
1743         if (pType->isDoubleTy())
1744         {
1745             return 8;
1746         }
1747 
1748         SWR_ASSERT(false, "Unimplemented type.");
1749         return 0;
1750     }
1751 }
1752