1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file builder_misc.cpp
24  *
25  * @brief Implementation for miscellaneous builder functions
26  *
27  * Notes:
28  *
29  ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>
#include <cstring>
35 
36 extern "C" void CallPrint(const char* fmt, ...);
37 
38 namespace SwrJit
39 {
40     //////////////////////////////////////////////////////////////////////////
41     /// @brief Convert an IEEE 754 32-bit single precision float to an
42     ///        16 bit float with 5 exponent bits and a variable
43     ///        number of mantissa bits.
44     /// @param val - 32-bit float
45     /// @todo Maybe move this outside of this file into a header?
ConvertFloat32ToFloat16(float val)46     static uint16_t ConvertFloat32ToFloat16(float val)
47     {
48         uint32_t sign, exp, mant;
49         uint32_t roundBits;
50 
51         // Extract the sign, exponent, and mantissa
52         uint32_t uf = *(uint32_t*)&val;
53         sign        = (uf & 0x80000000) >> 31;
54         exp         = (uf & 0x7F800000) >> 23;
55         mant        = uf & 0x007FFFFF;
56 
57         // Check for out of range
58         if (std::isnan(val))
59         {
60             exp  = 0x1F;
61             mant = 0x200;
62             sign = 1; // set the sign bit for NANs
63         }
64         else if (std::isinf(val))
65         {
66             exp  = 0x1f;
67             mant = 0x0;
68         }
69         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
70         {
71             exp  = 0x1E;
72             mant = 0x3FF;
73         }
74         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
75         {
76             mant |= 0x00800000;
77             for (; exp <= 0x70; mant >>= 1, exp++)
78                 ;
79             exp  = 0;
80             mant = mant >> 13;
81         }
82         else if (exp < 0x66) // Too small to represent -> Zero
83         {
84             exp  = 0;
85             mant = 0;
86         }
87         else
88         {
89             // Saves bits that will be shifted off for rounding
90             roundBits = mant & 0x1FFFu;
91             // convert exponent and mantissa to 16 bit format
92             exp  = exp - 0x70;
93             mant = mant >> 13;
94 
95             // Essentially RTZ, but round up if off by only 1 lsb
96             if (roundBits == 0x1FFFu)
97             {
98                 mant++;
99                 // check for overflow
100                 if ((mant & 0xC00u) != 0)
101                     exp++;
102                 // make sure only the needed bits are used
103                 mant &= 0x3FF;
104             }
105         }
106 
107         uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
108         return (uint16_t)tmpVal;
109     }
110 
C(bool i)111     Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); }
112 
C(char i)113     Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
114 
C(uint8_t i)115     Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
116 
C(int i)117     Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
118 
C(int64_t i)119     Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
120 
C(uint16_t i)121     Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); }
122 
C(uint32_t i)123     Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
124 
C(uint64_t i)125     Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
126 
C(float i)127     Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); }
128 
PRED(bool pred)129     Constant* Builder::PRED(bool pred)
130     {
131         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
132     }
133 
VIMMED1(uint64_t i)134     Value* Builder::VIMMED1(uint64_t i)
135     {
136 #if LLVM_VERSION_MAJOR <= 10
137         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
138 #elif LLVM_VERSION_MAJOR == 11
139         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
140 #else
141         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
142 #endif
143     }
144 
VIMMED1_16(uint64_t i)145     Value* Builder::VIMMED1_16(uint64_t i)
146     {
147 #if LLVM_VERSION_MAJOR <= 10
148         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
149 #elif LLVM_VERSION_MAJOR == 11
150         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
151 #else
152         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
153 #endif
154     }
155 
VIMMED1(int i)156     Value* Builder::VIMMED1(int i)
157     {
158 #if LLVM_VERSION_MAJOR <= 10
159         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
160 #elif LLVM_VERSION_MAJOR == 11
161         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
162 #else
163         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
164 #endif
165     }
166 
VIMMED1_16(int i)167     Value* Builder::VIMMED1_16(int i)
168     {
169 #if LLVM_VERSION_MAJOR <= 10
170         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
171 #elif LLVM_VERSION_MAJOR == 11
172         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
173 #else
174         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
175 #endif
176     }
177 
VIMMED1(uint32_t i)178     Value* Builder::VIMMED1(uint32_t i)
179     {
180 #if LLVM_VERSION_MAJOR <= 10
181         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
182 #elif LLVM_VERSION_MAJOR == 11
183         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
184 #else
185         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
186 #endif
187     }
188 
VIMMED1_16(uint32_t i)189     Value* Builder::VIMMED1_16(uint32_t i)
190     {
191 #if LLVM_VERSION_MAJOR <= 10
192         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
193 #elif LLVM_VERSION_MAJOR == 11
194         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
195 #else
196         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
197 #endif
198     }
199 
VIMMED1(float i)200     Value* Builder::VIMMED1(float i)
201     {
202 #if LLVM_VERSION_MAJOR <= 10
203         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
204 #elif LLVM_VERSION_MAJOR == 11
205         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantFP>(C(i)));
206 #else
207         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantFP>(C(i)));
208 #endif
209     }
210 
VIMMED1_16(float i)211     Value* Builder::VIMMED1_16(float i)
212     {
213 #if LLVM_VERSION_MAJOR <= 10
214         return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
215 #elif LLVM_VERSION_MAJOR == 11
216         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantFP>(C(i)));
217 #else
218         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantFP>(C(i)));
219 #endif
220     }
221 
VIMMED1(bool i)222     Value* Builder::VIMMED1(bool i)
223     {
224 #if LLVM_VERSION_MAJOR <= 10
225         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
226 #elif LLVM_VERSION_MAJOR == 11
227         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
228 #else
229         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
230 #endif
231     }
232 
VIMMED1_16(bool i)233     Value* Builder::VIMMED1_16(bool i)
234     {
235 #if LLVM_VERSION_MAJOR <= 10
236         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
237 #elif LLVM_VERSION_MAJOR == 11
238         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
239 #else
240         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
241 #endif
242     }
243 
VUNDEF_IPTR()244     Value* Builder::VUNDEF_IPTR() { return UndefValue::get(getVectorType(mInt32PtrTy, mVWidth)); }
245 
VUNDEF(Type * t)246     Value* Builder::VUNDEF(Type* t) { return UndefValue::get(getVectorType(t, mVWidth)); }
247 
VUNDEF_I()248     Value* Builder::VUNDEF_I() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth)); }
249 
VUNDEF_I_16()250     Value* Builder::VUNDEF_I_16() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth16)); }
251 
VUNDEF_F()252     Value* Builder::VUNDEF_F() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth)); }
253 
VUNDEF_F_16()254     Value* Builder::VUNDEF_F_16() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth16)); }
255 
VUNDEF(Type * ty,uint32_t size)256     Value* Builder::VUNDEF(Type* ty, uint32_t size)
257     {
258         return UndefValue::get(getVectorType(ty, size));
259     }
260 
VBROADCAST(Value * src,const llvm::Twine & name)261     Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name)
262     {
263         // check if src is already a vector
264         if (src->getType()->isVectorTy())
265         {
266             return src;
267         }
268 
269         return VECTOR_SPLAT(mVWidth, src, name);
270     }
271 
VBROADCAST_16(Value * src)272     Value* Builder::VBROADCAST_16(Value* src)
273     {
274         // check if src is already a vector
275         if (src->getType()->isVectorTy())
276         {
277             return src;
278         }
279 
280         return VECTOR_SPLAT(mVWidth16, src);
281     }
282 
IMMED(Value * v)283     uint32_t Builder::IMMED(Value* v)
284     {
285         SWR_ASSERT(isa<ConstantInt>(v));
286         ConstantInt* pValConst = cast<ConstantInt>(v);
287         return pValConst->getZExtValue();
288     }
289 
S_IMMED(Value * v)290     int32_t Builder::S_IMMED(Value* v)
291     {
292         SWR_ASSERT(isa<ConstantInt>(v));
293         ConstantInt* pValConst = cast<ConstantInt>(v);
294         return pValConst->getSExtValue();
295     }
296 
CALL(Value * Callee,const std::initializer_list<Value * > & argsList,const llvm::Twine & name)297     CallInst* Builder::CALL(Value*                               Callee,
298                             const std::initializer_list<Value*>& argsList,
299                             const llvm::Twine&                   name)
300     {
301         std::vector<Value*> args;
302         for (auto arg : argsList)
303             args.push_back(arg);
304 #if LLVM_VERSION_MAJOR >= 11
305         // see comment to CALLA(Callee) function in the header
306         return CALLA(FunctionCallee(cast<Function>(Callee)), args, name);
307 #else
308         return CALLA(Callee, args, name);
309 #endif
310     }
311 
CALL(Value * Callee,Value * arg)312     CallInst* Builder::CALL(Value* Callee, Value* arg)
313     {
314         std::vector<Value*> args;
315         args.push_back(arg);
316 #if LLVM_VERSION_MAJOR >= 11
317         // see comment to CALLA(Callee) function in the header
318         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
319 #else
320         return CALLA(Callee, args);
321 #endif
322     }
323 
CALL2(Value * Callee,Value * arg1,Value * arg2)324     CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2)
325     {
326         std::vector<Value*> args;
327         args.push_back(arg1);
328         args.push_back(arg2);
329 #if LLVM_VERSION_MAJOR >= 11
330         // see comment to CALLA(Callee) function in the header
331         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
332 #else
333         return CALLA(Callee, args);
334 #endif
335     }
336 
CALL3(Value * Callee,Value * arg1,Value * arg2,Value * arg3)337     CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3)
338     {
339         std::vector<Value*> args;
340         args.push_back(arg1);
341         args.push_back(arg2);
342         args.push_back(arg3);
343 #if LLVM_VERSION_MAJOR >= 11
344         // see comment to CALLA(Callee) function in the header
345         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
346 #else
347         return CALLA(Callee, args);
348 #endif
349     }
350 
VRCP(Value * va,const llvm::Twine & name)351     Value* Builder::VRCP(Value* va, const llvm::Twine& name)
352     {
353         return FDIV(VIMMED1(1.0f), va, name); // 1 / a
354     }
355 
VPLANEPS(Value * vA,Value * vB,Value * vC,Value * & vX,Value * & vY)356     Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY)
357     {
358         Value* vOut = FMADDPS(vA, vX, vC);
359         vOut        = FMADDPS(vB, vY, vOut);
360         return vOut;
361     }
362 
363     //////////////////////////////////////////////////////////////////////////
364     /// @brief insert a JIT call to CallPrint
365     /// - outputs formatted string to both stdout and VS output window
366     /// - DEBUG builds only
367     /// Usage example:
368     ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
369     ///   where C(lane) creates a constant value to print, and pIndex is the Value*
370     ///   result from a GEP, printing out the pointer to memory
371     /// @param printStr - constant string to print, which includes format specifiers
372     /// @param printArgs - initializer list of Value*'s to print to std out
PRINT(const std::string & printStr,const std::initializer_list<Value * > & printArgs)373     CallInst* Builder::PRINT(const std::string&                   printStr,
374                              const std::initializer_list<Value*>& printArgs)
375     {
376         // push the arguments to CallPrint into a vector
377         std::vector<Value*> printCallArgs;
378         // save room for the format string.  we still need to modify it for vectors
379         printCallArgs.resize(1);
380 
381         // search through the format string for special processing
382         size_t      pos = 0;
383         std::string tempStr(printStr);
384         pos    = tempStr.find('%', pos);
385         auto v = printArgs.begin();
386 
387         while ((pos != std::string::npos) && (v != printArgs.end()))
388         {
389             Value* pArg  = *v;
390             Type*  pType = pArg->getType();
391 
392             if (pType->isVectorTy())
393             {
394                 Type* pContainedType = pType->getContainedType(0);
395 #if LLVM_VERSION_MAJOR >= 11
396                 VectorType* pVectorType = cast<VectorType>(pType);
397 #endif
398                 if (toupper(tempStr[pos + 1]) == 'X')
399                 {
400                     tempStr[pos]     = '0';
401                     tempStr[pos + 1] = 'x';
402                     tempStr.insert(pos + 2, "%08X ");
403                     pos += 7;
404 
405                     printCallArgs.push_back(VEXTRACT(pArg, C(0)));
406 
407                     std::string vectorFormatStr;
408 #if LLVM_VERSION_MAJOR >= 11
409                     for (uint32_t i = 1; i < pVectorType->getNumElements(); ++i)
410 #else
411                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
412 #endif
413                     {
414                         vectorFormatStr += "0x%08X ";
415                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
416                     }
417 
418                     tempStr.insert(pos, vectorFormatStr);
419                     pos += vectorFormatStr.size();
420                 }
421                 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
422                 {
423                     uint32_t i = 0;
424 #if LLVM_VERSION_MAJOR >= 11
425                     for (; i < pVectorType->getNumElements() - 1; i++)
426 #else
427                     for (; i < pType->getVectorNumElements() - 1; i++)
428 #endif
429                     {
430                         tempStr.insert(pos, std::string("%f "));
431                         pos += 3;
432                         printCallArgs.push_back(
433                             FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
434                     }
435                     printCallArgs.push_back(
436                         FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
437                 }
438                 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
439                 {
440                     uint32_t i = 0;
441 #if LLVM_VERSION_MAJOR >= 11
442                     for (; i < pVectorType->getNumElements() - 1; i++)
443 #else
444                     for (; i < pType->getVectorNumElements() - 1; i++)
445 #endif
446                     {
447                         tempStr.insert(pos, std::string("%d "));
448                         pos += 3;
449                         printCallArgs.push_back(
450                             S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
451                     }
452                     printCallArgs.push_back(
453                         S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
454                 }
455                 else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy()))
456                 {
457                     uint32_t i = 0;
458 #if LLVM_VERSION_MAJOR >= 11
459                     for (; i < pVectorType->getNumElements() - 1; i++)
460 #else
461                     for (; i < pType->getVectorNumElements() - 1; i++)
462 #endif
463                     {
464                         tempStr.insert(pos, std::string("%d "));
465                         pos += 3;
466                         printCallArgs.push_back(
467                             Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
468                     }
469                     printCallArgs.push_back(
470                         Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
471                 }
472             }
473             else
474             {
475                 if (toupper(tempStr[pos + 1]) == 'X')
476                 {
477                     tempStr[pos] = '0';
478                     tempStr.insert(pos + 1, "x%08");
479                     printCallArgs.push_back(pArg);
480                     pos += 3;
481                 }
482                 // for %f we need to cast float Values to doubles so that they print out correctly
483                 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
484                 {
485                     printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
486                     pos++;
487                 }
488                 else
489                 {
490                     printCallArgs.push_back(pArg);
491                 }
492             }
493 
494             // advance to the next arguement
495             v++;
496             pos = tempStr.find('%', ++pos);
497         }
498 
499         // create global variable constant string
500         Constant*       constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
501         GlobalVariable* gvPtr       = new GlobalVariable(
502             constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
503         JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
504 
505         // get a pointer to the first character in the constant string array
506         std::vector<Constant*> geplist{C(0), C(0)};
507         Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);
508 
509         // insert the pointer to the format string in the argument vector
510         printCallArgs[0] = strGEP;
511 
512         // get pointer to CallPrint function and insert decl into the module if needed
513         std::vector<Type*> args;
514         args.push_back(PointerType::get(mInt8Ty, 0));
515         FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
516         Function*     callPrintFn =
517 #if LLVM_VERSION_MAJOR >= 9
518             cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy).getCallee());
519 #else
520             cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
521 #endif
522 
523         // if we haven't yet added the symbol to the symbol table
524         if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
525         {
526             sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint);
527         }
528 
529         // insert a call to CallPrint
530         return CALLA(callPrintFn, printCallArgs);
531     }
532 
533     //////////////////////////////////////////////////////////////////////////
534     /// @brief Wrapper around PRINT with initializer list.
PRINT(const std::string & printStr)535     CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); }
536 
EXTRACT_16(Value * x,uint32_t imm)537     Value* Builder::EXTRACT_16(Value* x, uint32_t imm)
538     {
539         if (imm == 0)
540         {
541             return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7});
542         }
543         else
544         {
545             return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15});
546         }
547     }
548 
JOIN_16(Value * a,Value * b)549     Value* Builder::JOIN_16(Value* a, Value* b)
550     {
551         return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
552     }
553 
554     //////////////////////////////////////////////////////////////////////////
555     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
MASK(Value * vmask)556     Value* Builder::MASK(Value* vmask)
557     {
558         Value* src = BITCAST(vmask, mSimdInt32Ty);
559         return ICMP_SLT(src, VIMMED1(0));
560     }
561 
MASK_16(Value * vmask)562     Value* Builder::MASK_16(Value* vmask)
563     {
564         Value* src = BITCAST(vmask, mSimd16Int32Ty);
565         return ICMP_SLT(src, VIMMED1_16(0));
566     }
567 
568     //////////////////////////////////////////////////////////////////////////
569     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
VMASK(Value * mask)570     Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); }
571 
VMASK_16(Value * mask)572     Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); }
573 
574     /// @brief Convert <Nxi1> llvm mask to integer
VMOVMSK(Value * mask)575     Value* Builder::VMOVMSK(Value* mask)
576     {
577 #if LLVM_VERSION_MAJOR >= 11
578         VectorType* pVectorType = cast<VectorType>(mask->getType());
579         SWR_ASSERT(pVectorType->getElementType() == mInt1Ty);
580         uint32_t numLanes = pVectorType->getNumElements();
581 #else
582         SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
583         uint32_t numLanes = mask->getType()->getVectorNumElements();
584 #endif
585         Value*   i32Result;
586         if (numLanes == 8)
587         {
588             i32Result = BITCAST(mask, mInt8Ty);
589         }
590         else if (numLanes == 16)
591         {
592             i32Result = BITCAST(mask, mInt16Ty);
593         }
594         else
595         {
596             SWR_ASSERT("Unsupported vector width");
597             i32Result = BITCAST(mask, mInt8Ty);
598         }
599         return Z_EXT(i32Result, mInt32Ty);
600     }
601 
602     //////////////////////////////////////////////////////////////////////////
603     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
604     /// supported on the underlying platform, emulate it
605     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
606     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
607     /// Byte masks in lower 128 lane of b selects 8 bit values from lower
608     /// 128bits of a, and vice versa for the upper lanes.  If the mask
609     /// value is negative, '0' is inserted.
PSHUFB(Value * a,Value * b)610     Value* Builder::PSHUFB(Value* a, Value* b)
611     {
612         Value* res;
613         // use avx2 pshufb instruction if available
614         if (JM()->mArch.AVX2())
615         {
616             res = VPSHUFB(a, b);
617         }
618         else
619         {
620             Constant* cB = dyn_cast<Constant>(b);
621             assert(cB != nullptr);
622             // number of 8 bit elements in b
623             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
624             // output vector
625             Value* vShuf = UndefValue::get(getVectorType(mInt8Ty, numElms));
626 
627             // insert an 8 bit value from the high and low lanes of a per loop iteration
628             numElms /= 2;
629             for (uint32_t i = 0; i < numElms; i++)
630             {
631                 ConstantInt* cLow128b  = cast<ConstantInt>(cB->getAggregateElement(i));
632                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
633 
634                 // extract values from constant mask
635                 char valLow128bLane  = (char)(cLow128b->getSExtValue());
636                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
637 
638                 Value* insertValLow128b;
639                 Value* insertValHigh128b;
640 
641                 // if the mask value is negative, insert a '0' in the respective output position
642                 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask
643                 // byte) in a and insert in output vector
644                 insertValLow128b =
645                     (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
646                 insertValHigh128b = (valHigh128bLane < 0)
647                                         ? C((char)0)
648                                         : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
649 
650                 vShuf = VINSERT(vShuf, insertValLow128b, i);
651                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
652             }
653             res = vShuf;
654         }
655         return res;
656     }
657 
658     //////////////////////////////////////////////////////////////////////////
659     /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
660     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
661     /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
662     /// lower 8 values are used.
PMOVSXBD(Value * a)663     Value* Builder::PMOVSXBD(Value* a)
664     {
665         // VPMOVSXBD output type
666         Type* v8x32Ty = getVectorType(mInt32Ty, 8);
667         // Extract 8 values from 128bit lane and sign extend
668         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
669     }
670 
671     //////////////////////////////////////////////////////////////////////////
672     /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
673     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
674     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
PMOVSXWD(Value * a)675     Value* Builder::PMOVSXWD(Value* a)
676     {
677         // VPMOVSXWD output type
678         Type* v8x32Ty = getVectorType(mInt32Ty, 8);
679         // Extract 8 values from 128bit lane and sign extend
680         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
681     }
682 
683     //////////////////////////////////////////////////////////////////////////
684     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
685     /// in LLVM IR.  If not supported on the underlying platform, emulate it
686     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
CVTPH2PS(Value * a,const llvm::Twine & name)687     Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
688     {
689         // Bitcast Nxint16 to Nxhalf
690 #if LLVM_VERSION_MAJOR >= 11
691         uint32_t numElems = cast<VectorType>(a->getType())->getNumElements();
692 #else
693         uint32_t numElems = a->getType()->getVectorNumElements();
694 #endif
695         Value*   input    = BITCAST(a, getVectorType(mFP16Ty, numElems));
696 
697         return FP_EXT(input, getVectorType(mFP32Ty, numElems), name);
698     }
699 
700     //////////////////////////////////////////////////////////////////////////
701     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
702     /// in LLVM IR.  If not supported on the underlying platform, emulate it
703     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
CVTPS2PH(Value * a,Value * rounding)704     Value* Builder::CVTPS2PH(Value* a, Value* rounding)
705     {
706         if (JM()->mArch.F16C())
707         {
708             return VCVTPS2PH(a, rounding);
709         }
710         else
711         {
712             // call scalar C function for now
713             FunctionType* pFuncTy   = FunctionType::get(mInt16Ty, mFP32Ty);
714             Function*     pCvtPs2Ph = cast<Function>(
715 #if LLVM_VERSION_MAJOR >= 9
716                 JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy).getCallee());
717 #else
718                 JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
719 #endif
720 
721             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
722             {
723                 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16",
724                                                (void*)&ConvertFloat32ToFloat16);
725             }
726 
727             Value* pResult = UndefValue::get(mSimdInt16Ty);
728             for (uint32_t i = 0; i < mVWidth; ++i)
729             {
730                 Value* pSrc  = VEXTRACT(a, C(i));
731                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
732                 pResult      = VINSERT(pResult, pConv, C(i));
733             }
734 
735             return pResult;
736         }
737     }
738 
PMAXSD(Value * a,Value * b)739     Value* Builder::PMAXSD(Value* a, Value* b)
740     {
741         Value* cmp = ICMP_SGT(a, b);
742         return SELECT(cmp, a, b);
743     }
744 
PMINSD(Value * a,Value * b)745     Value* Builder::PMINSD(Value* a, Value* b)
746     {
747         Value* cmp = ICMP_SLT(a, b);
748         return SELECT(cmp, a, b);
749     }
750 
PMAXUD(Value * a,Value * b)751     Value* Builder::PMAXUD(Value* a, Value* b)
752     {
753         Value* cmp = ICMP_UGT(a, b);
754         return SELECT(cmp, a, b);
755     }
756 
PMINUD(Value * a,Value * b)757     Value* Builder::PMINUD(Value* a, Value* b)
758     {
759         Value* cmp = ICMP_ULT(a, b);
760         return SELECT(cmp, a, b);
761     }
762 
763     // Helper function to create alloca in entry block of function
CreateEntryAlloca(Function * pFunc,Type * pType)764     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
765     {
766         auto saveIP = IRB()->saveIP();
767         IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
768         Value* pAlloca = ALLOCA(pType);
769         if (saveIP.isSet())
770             IRB()->restoreIP(saveIP);
771         return pAlloca;
772     }
773 
CreateEntryAlloca(Function * pFunc,Type * pType,Value * pArraySize)774     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
775     {
776         auto saveIP = IRB()->saveIP();
777         IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
778         Value* pAlloca = ALLOCA(pType, pArraySize);
779         if (saveIP.isSet())
780             IRB()->restoreIP(saveIP);
781         return pAlloca;
782     }
783 
VABSPS(Value * a)784     Value* Builder::VABSPS(Value* a)
785     {
786         Value* asInt  = BITCAST(a, mSimdInt32Ty);
787         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
788         return result;
789     }
790 
ICLAMP(Value * src,Value * low,Value * high,const llvm::Twine & name)791     Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
792     {
793         Value* lowCmp = ICMP_SLT(src, low);
794         Value* ret    = SELECT(lowCmp, low, src);
795 
796         Value* highCmp = ICMP_SGT(ret, high);
797         ret            = SELECT(highCmp, high, ret, name);
798 
799         return ret;
800     }
801 
FCLAMP(Value * src,Value * low,Value * high)802     Value* Builder::FCLAMP(Value* src, Value* low, Value* high)
803     {
804         Value* lowCmp = FCMP_OLT(src, low);
805         Value* ret    = SELECT(lowCmp, low, src);
806 
807         Value* highCmp = FCMP_OGT(ret, high);
808         ret            = SELECT(highCmp, high, ret);
809 
810         return ret;
811     }
812 
FCLAMP(Value * src,float low,float high)813     Value* Builder::FCLAMP(Value* src, float low, float high)
814     {
815         Value* result = VMAXPS(src, VIMMED1(low));
816         result        = VMINPS(result, VIMMED1(high));
817 
818         return result;
819     }
820 
FMADDPS(Value * a,Value * b,Value * c)821     Value* Builder::FMADDPS(Value* a, Value* b, Value* c)
822     {
823         Value* vOut;
824         // This maps to LLVM fmuladd intrinsic
825         vOut = VFMADDPS(a, b, c);
826         return vOut;
827     }
828 
829     //////////////////////////////////////////////////////////////////////////
830     /// @brief pop count on vector mask (e.g. <8 x i1>)
VPOPCNT(Value * a)831     Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); }
832 
833     //////////////////////////////////////////////////////////////////////////
834     /// @brief Float / Fixed-point conversions
835     //////////////////////////////////////////////////////////////////////////
VCVT_F32_FIXED_SI(Value * vFloat,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)836     Value* Builder::VCVT_F32_FIXED_SI(Value*             vFloat,
837                                       uint32_t           numIntBits,
838                                       uint32_t           numFracBits,
839                                       const llvm::Twine& name)
840     {
841         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
842         Value* fixed = nullptr;
843 
844 #if 0   // This doesn't work for negative numbers!!
845         {
846             fixed = FP_TO_SI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
847                                     C(_MM_FROUND_TO_NEAREST_INT)),
848                              mSimdInt32Ty);
849         }
850         else
851 #endif
852         {
853             // Do round to nearest int on fractional bits first
854             // Not entirely perfect for negative numbers, but close enough
855             vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
856                             C(_MM_FROUND_TO_NEAREST_INT));
857             vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
858 
859             // TODO: Handle INF, NAN, overflow / underflow, etc.
860 
861             Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f));
862             Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
863             Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1));
864             vFixed           = OR(vFixed, VIMMED1(1 << 23));
865             vFixed           = SELECT(vSgn, NEG(vFixed), vFixed);
866 
867             Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
868             vExp        = SUB(vExp, VIMMED1(127));
869 
870             Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
871 
872             fixed = ASHR(vFixed, vExtraBits, name);
873         }
874 
875         return fixed;
876     }
877 
VCVT_FIXED_SI_F32(Value * vFixed,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)878     Value* Builder::VCVT_FIXED_SI_F32(Value*             vFixed,
879                                       uint32_t           numIntBits,
880                                       uint32_t           numFracBits,
881                                       const llvm::Twine& name)
882     {
883         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
884         uint32_t extraBits = 32 - numIntBits - numFracBits;
885         if (numIntBits && extraBits)
886         {
887             // Sign extend
888             Value* shftAmt = VIMMED1(extraBits);
889             vFixed         = ASHR(SHL(vFixed, shftAmt), shftAmt);
890         }
891 
892         Value* fVal  = VIMMED1(0.0f);
893         Value* fFrac = VIMMED1(0.0f);
894         if (numIntBits)
895         {
896             fVal = SI_TO_FP(ASHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
897         }
898 
899         if (numFracBits)
900         {
901             fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
902             fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
903         }
904 
905         return FADD(fVal, fFrac, name);
906     }
907 
VCVT_F32_FIXED_UI(Value * vFloat,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)908     Value* Builder::VCVT_F32_FIXED_UI(Value*             vFloat,
909                                       uint32_t           numIntBits,
910                                       uint32_t           numFracBits,
911                                       const llvm::Twine& name)
912     {
913         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
914         Value* fixed = nullptr;
915 #if 1   // KNOB_SIM_FAST_MATH?  Below works correctly from a precision
916         // standpoint...
917         {
918             fixed = FP_TO_UI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
919                                     C(_MM_FROUND_TO_NEAREST_INT)),
920                              mSimdInt32Ty);
921         }
922 #else
923         {
924             // Do round to nearest int on fractional bits first
925             vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
926                             C(_MM_FROUND_TO_NEAREST_INT));
927             vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
928 
929             // TODO: Handle INF, NAN, overflow / underflow, etc.
930 
931             Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f));
932             Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
933             Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1));
934             vFixed           = OR(vFixed, VIMMED1(1 << 23));
935 
936             Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
937             vExp        = SUB(vExp, VIMMED1(127));
938 
939             Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
940 
941             fixed = LSHR(vFixed, vExtraBits, name);
942         }
943 #endif
944         return fixed;
945     }
946 
VCVT_FIXED_UI_F32(Value * vFixed,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)947     Value* Builder::VCVT_FIXED_UI_F32(Value*             vFixed,
948                                       uint32_t           numIntBits,
949                                       uint32_t           numFracBits,
950                                       const llvm::Twine& name)
951     {
952         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
953         uint32_t extraBits = 32 - numIntBits - numFracBits;
954         if (numIntBits && extraBits)
955         {
956             // Sign extend
957             Value* shftAmt = VIMMED1(extraBits);
958             vFixed         = ASHR(SHL(vFixed, shftAmt), shftAmt);
959         }
960 
961         Value* fVal  = VIMMED1(0.0f);
962         Value* fFrac = VIMMED1(0.0f);
963         if (numIntBits)
964         {
965             fVal = UI_TO_FP(LSHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
966         }
967 
968         if (numFracBits)
969         {
970             fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
971             fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
972         }
973 
974         return FADD(fVal, fFrac, name);
975     }
976 
977     //////////////////////////////////////////////////////////////////////////
978     /// @brief C functions called by LLVM IR
979     //////////////////////////////////////////////////////////////////////////
980 
VEXTRACTI128(Value * a,Constant * imm8)981     Value* Builder::VEXTRACTI128(Value* a, Constant* imm8)
982     {
983         bool                      flag = !imm8->isZeroValue();
984         SmallVector<Constant*, 8> idx;
985         for (unsigned i = 0; i < mVWidth / 2; i++)
986         {
987             idx.push_back(C(flag ? i + mVWidth / 2 : i));
988         }
989         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
990     }
991 
VINSERTI128(Value * a,Value * b,Constant * imm8)992     Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
993     {
994         bool                      flag = !imm8->isZeroValue();
995         SmallVector<Constant*, 8> idx;
996         for (unsigned i = 0; i < mVWidth; i++)
997         {
998             idx.push_back(C(i));
999         }
1000         Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1001 
1002         SmallVector<Constant*, 8> idx2;
1003         for (unsigned i = 0; i < mVWidth / 2; i++)
1004         {
1005             idx2.push_back(C(flag ? i : i + mVWidth));
1006         }
1007         for (unsigned i = mVWidth / 2; i < mVWidth; i++)
1008         {
1009             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1010         }
1011         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1012     }
1013 
1014     // rdtsc buckets macros
RDTSC_START(Value * pBucketMgr,Value * pId)1015     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1016     {
1017         // @todo due to an issue with thread local storage propagation in llvm, we can only safely
1018         // call into buckets framework when single threaded
1019         if (KNOB_SINGLE_THREADED)
1020         {
1021             std::vector<Type*> args{
1022                 PointerType::get(mInt32Ty, 0), // pBucketMgr
1023                 mInt32Ty                       // id
1024             };
1025 
1026             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1027             Function*     pFunc   = cast<Function>(
1028 #if LLVM_VERSION_MAJOR >= 9
1029                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy).getCallee());
1030 #else
1031                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1032 #endif
1033             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") ==
1034                 nullptr)
1035             {
1036                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket",
1037                                                (void*)&BucketManager_StartBucket);
1038             }
1039 
1040             CALL(pFunc, {pBucketMgr, pId});
1041         }
1042     }
1043 
RDTSC_STOP(Value * pBucketMgr,Value * pId)1044     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1045     {
1046         // @todo due to an issue with thread local storage propagation in llvm, we can only safely
1047         // call into buckets framework when single threaded
1048         if (KNOB_SINGLE_THREADED)
1049         {
1050             std::vector<Type*> args{
1051                 PointerType::get(mInt32Ty, 0), // pBucketMgr
1052                 mInt32Ty                       // id
1053             };
1054 
1055             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1056             Function*     pFunc   = cast<Function>(
1057 #if LLVM_VERSION_MAJOR >= 9
1058                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy).getCallee());
1059 #else
1060                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1061 #endif
1062             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") ==
1063                 nullptr)
1064             {
1065                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket",
1066                                                (void*)&BucketManager_StopBucket);
1067             }
1068 
1069             CALL(pFunc, {pBucketMgr, pId});
1070         }
1071     }
1072 
GetTypeSize(Type * pType)1073     uint32_t Builder::GetTypeSize(Type* pType)
1074     {
1075         if (pType->isStructTy())
1076         {
1077             uint32_t numElems = pType->getStructNumElements();
1078             Type*    pElemTy  = pType->getStructElementType(0);
1079             return numElems * GetTypeSize(pElemTy);
1080         }
1081 
1082         if (pType->isArrayTy())
1083         {
1084             uint32_t numElems = pType->getArrayNumElements();
1085             Type*    pElemTy  = pType->getArrayElementType();
1086             return numElems * GetTypeSize(pElemTy);
1087         }
1088 
1089         if (pType->isIntegerTy())
1090         {
1091             uint32_t bitSize = pType->getIntegerBitWidth();
1092             return bitSize / 8;
1093         }
1094 
1095         if (pType->isFloatTy())
1096         {
1097             return 4;
1098         }
1099 
1100         if (pType->isHalfTy())
1101         {
1102             return 2;
1103         }
1104 
1105         if (pType->isDoubleTy())
1106         {
1107             return 8;
1108         }
1109 
1110         SWR_ASSERT(false, "Unimplemented type.");
1111         return 0;
1112     }
1113 } // namespace SwrJit
1114