1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file builder_misc.cpp
24  *
25  * @brief Implementation for miscellaneous builder functions
26  *
27  * Notes:
28  *
29  ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>
#include <cstring>
34 
35 namespace SwrJit
36 {
AssertMemoryUsageParams(Value * ptr,MEM_CLIENT usage)37     void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
38     {
39         SWR_ASSERT(
40             ptr->getType() != mInt64Ty,
41             "Address appears to be GFX access.  Requires translation through BuilderGfxMem.");
42     }
43 
GEP(Value * Ptr,Value * Idx,Type * Ty,bool isReadOnly,const Twine & Name)44     Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
45     {
46         return IRB()->CreateGEP(Ptr, Idx, Name);
47     }
48 
GEP(Type * Ty,Value * Ptr,Value * Idx,const Twine & Name)49     Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
50     {
51         return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
52     }
53 
GEP(Value * ptr,const std::initializer_list<Value * > & indexList,Type * Ty)54     Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
55     {
56         std::vector<Value*> indices;
57         for (auto i : indexList)
58             indices.push_back(i);
59         return GEPA(ptr, indices);
60     }
61 
GEP(Value * ptr,const std::initializer_list<uint32_t> & indexList,Type * Ty)62     Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
63     {
64         std::vector<Value*> indices;
65         for (auto i : indexList)
66             indices.push_back(C(i));
67         return GEPA(ptr, indices);
68     }
69 
GEPA(Value * Ptr,ArrayRef<Value * > IdxList,const Twine & Name)70     Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
71     {
72         return IRB()->CreateGEP(Ptr, IdxList, Name);
73     }
74 
GEPA(Type * Ty,Value * Ptr,ArrayRef<Value * > IdxList,const Twine & Name)75     Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
76     {
77         return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
78     }
79 
IN_BOUNDS_GEP(Value * ptr,const std::initializer_list<Value * > & indexList)80     Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
81     {
82         std::vector<Value*> indices;
83         for (auto i : indexList)
84             indices.push_back(i);
85         return IN_BOUNDS_GEP(ptr, indices);
86     }
87 
IN_BOUNDS_GEP(Value * ptr,const std::initializer_list<uint32_t> & indexList)88     Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
89     {
90         std::vector<Value*> indices;
91         for (auto i : indexList)
92             indices.push_back(C(i));
93         return IN_BOUNDS_GEP(ptr, indices);
94     }
95 
LOAD(Value * Ptr,const char * Name,Type * Ty,MEM_CLIENT usage)96     LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
97     {
98         AssertMemoryUsageParams(Ptr, usage);
99         return IRB()->CreateLoad(Ptr, Name);
100     }
101 
LOAD(Value * Ptr,const Twine & Name,Type * Ty,MEM_CLIENT usage)102     LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
103     {
104         AssertMemoryUsageParams(Ptr, usage);
105         return IRB()->CreateLoad(Ptr, Name);
106     }
107 
LOAD(Type * Ty,Value * Ptr,const Twine & Name,MEM_CLIENT usage)108     LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
109     {
110         AssertMemoryUsageParams(Ptr, usage);
111         return IRB()->CreateLoad(Ty, Ptr, Name);
112     }
113 
114     LoadInst*
LOAD(Value * Ptr,bool isVolatile,const Twine & Name,Type * Ty,MEM_CLIENT usage)115     Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
116     {
117         AssertMemoryUsageParams(Ptr, usage);
118         return IRB()->CreateLoad(Ptr, isVolatile, Name);
119     }
120 
LOAD(Value * basePtr,const std::initializer_list<uint32_t> & indices,const llvm::Twine & name,Type * Ty,MEM_CLIENT usage)121     LoadInst* Builder::LOAD(Value*                                 basePtr,
122                             const std::initializer_list<uint32_t>& indices,
123                             const llvm::Twine&                     name,
124                             Type*                                  Ty,
125                             MEM_CLIENT                             usage)
126     {
127         std::vector<Value*> valIndices;
128         for (auto i : indices)
129             valIndices.push_back(C(i));
130         return Builder::LOAD(GEPA(basePtr, valIndices), name);
131     }
132 
LOADV(Value * basePtr,const std::initializer_list<Value * > & indices,const llvm::Twine & name)133     LoadInst* Builder::LOADV(Value*                               basePtr,
134                              const std::initializer_list<Value*>& indices,
135                              const llvm::Twine&                   name)
136     {
137         std::vector<Value*> valIndices;
138         for (auto i : indices)
139             valIndices.push_back(i);
140         return LOAD(GEPA(basePtr, valIndices), name);
141     }
142 
143     StoreInst*
STORE(Value * val,Value * basePtr,const std::initializer_list<uint32_t> & indices,Type * Ty,MEM_CLIENT usage)144     Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, MEM_CLIENT usage)
145     {
146         std::vector<Value*> valIndices;
147         for (auto i : indices)
148             valIndices.push_back(C(i));
149         return STORE(val, GEPA(basePtr, valIndices));
150     }
151 
152     StoreInst*
STOREV(Value * val,Value * basePtr,const std::initializer_list<Value * > & indices)153     Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
154     {
155         std::vector<Value*> valIndices;
156         for (auto i : indices)
157             valIndices.push_back(i);
158         return STORE(val, GEPA(basePtr, valIndices));
159     }
160 
OFFSET_TO_NEXT_COMPONENT(Value * base,Constant * offset)161     Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
162     {
163         return GEP(base, offset);
164     }
165 
MEM_ADD(Value * i32Incr,Value * basePtr,const std::initializer_list<uint32_t> & indices,const llvm::Twine & name)166     Value* Builder::MEM_ADD(Value*                                 i32Incr,
167                             Value*                                 basePtr,
168                             const std::initializer_list<uint32_t>& indices,
169                             const llvm::Twine&                     name)
170     {
171         Value* i32Value  = LOAD(GEP(basePtr, indices), name);
172         Value* i32Result = ADD(i32Value, i32Incr);
173         return STORE(i32Result, GEP(basePtr, indices));
174     }
175 
176     //////////////////////////////////////////////////////////////////////////
177     /// @brief Generate a masked gather operation in LLVM IR.  If not
178     /// supported on the underlying platform, emulate it with loads
179     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
180     /// @param pBase - Int8* base VB address pointer value
181     /// @param vIndices - SIMD wide value of VB byte offsets
182     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
183     /// @param scale - value to scale indices by
GATHERPS(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale,MEM_CLIENT usage)184     Value* Builder::GATHERPS(Value*         vSrc,
185                              Value*         pBase,
186                              Value*         vIndices,
187                              Value*         vMask,
188                              uint8_t        scale,
189                              MEM_CLIENT     usage)
190     {
191         AssertMemoryUsageParams(pBase, usage);
192 
193         return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
194     }
195 
196     //////////////////////////////////////////////////////////////////////////
197     /// @brief Generate a masked gather operation in LLVM IR.  If not
198     /// supported on the underlying platform, emulate it with loads
199     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
200     /// @param pBase - Int8* base VB address pointer value
201     /// @param vIndices - SIMD wide value of VB byte offsets
202     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
203     /// @param scale - value to scale indices by
GATHERDD(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale,MEM_CLIENT usage)204     Value* Builder::GATHERDD(Value*         vSrc,
205                              Value*         pBase,
206                              Value*         vIndices,
207                              Value*         vMask,
208                              uint8_t        scale,
209                              MEM_CLIENT     usage)
210     {
211         AssertMemoryUsageParams(pBase, usage);
212 
213         return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
214     }
215 
216     //////////////////////////////////////////////////////////////////////////
217     /// @brief Generate a masked gather operation in LLVM IR.  If not
218     /// supported on the underlying platform, emulate it with loads
219     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
220     /// @param pBase - Int8* base VB address pointer value
221     /// @param vIndices - SIMD wide value of VB byte offsets
222     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
223     /// @param scale - value to scale indices by
224     Value*
GATHERPD(Value * vSrc,Value * pBase,Value * vIndices,Value * vMask,uint8_t scale)225     Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
226     {
227         return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
228     }
229 
230     //////////////////////////////////////////////////////////////////////////
231     /// @brief Alternative masked gather where source is a vector of pointers
232     /// @param pVecSrcPtr   - SIMD wide vector of pointers
233     /// @param pVecMask     - SIMD active lanes
234     /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
GATHER_PTR(Value * pVecSrcPtr,Value * pVecMask,Value * pVecPassthru)235     Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
236     {
237         return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
238     }
239 
SCATTER_PTR(Value * pVecDstPtr,Value * pVecSrc,Value * pVecMask)240     void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
241     {
242         MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
243     }
244 
Gather4(const SWR_FORMAT format,Value * pSrcBase,Value * byteOffsets,Value * mask,Value * vGatherComponents[],bool bPackedOutput,MEM_CLIENT usage)245     void Builder::Gather4(const SWR_FORMAT format,
246                           Value*           pSrcBase,
247                           Value*           byteOffsets,
248                           Value*           mask,
249                           Value*           vGatherComponents[],
250                           bool             bPackedOutput,
251                           MEM_CLIENT       usage)
252     {
253         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
254         if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
255         {
256             GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
257         }
258         else
259         {
260             GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
261         }
262     }
263 
GATHER4PS(const SWR_FORMAT_INFO & info,Value * pSrcBase,Value * byteOffsets,Value * vMask,Value * vGatherComponents[],bool bPackedOutput,MEM_CLIENT usage)264     void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
265                             Value*                 pSrcBase,
266                             Value*                 byteOffsets,
267                             Value*                 vMask,
268                             Value*                 vGatherComponents[],
269                             bool                   bPackedOutput,
270                             MEM_CLIENT             usage)
271     {
272         switch (info.bpp / info.numComps)
273         {
274         case 16:
275         {
276             Value* vGatherResult[2];
277 
278             // TODO: vGatherMaskedVal
279             Value* vGatherMaskedVal = VIMMED1((float)0);
280 
281             // always have at least one component out of x or y to fetch
282 
283             vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
284             // e.g. result of first 8x32bit integer gather for 16bit components
285             // 256i - 0    1    2    3    4    5    6    7
286             //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
287             //
288 
289             // if we have at least one component out of x or y to fetch
290             if (info.numComps > 2)
291             {
292                 // offset base to the next components(zw) in the vertex to gather
293                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
294 
295                 vGatherResult[1] =
296                     GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
297                 // e.g. result of second 8x32bit integer gather for 16bit components
298                 // 256i - 0    1    2    3    4    5    6    7
299                 //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
300                 //
301             }
302             else
303             {
304                 vGatherResult[1] = vGatherMaskedVal;
305             }
306 
307             // Shuffle gathered components into place, each row is a component
308             Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
309         }
310         break;
311         case 32:
312         {
313             // apply defaults
314             for (uint32_t i = 0; i < 4; ++i)
315             {
316                 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
317             }
318 
319             for (uint32_t i = 0; i < info.numComps; i++)
320             {
321                 uint32_t swizzleIndex = info.swizzle[i];
322 
323                 // Gather a SIMD of components
324                 vGatherComponents[swizzleIndex] = GATHERPS(
325                     vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
326 
327                 // offset base to the next component to gather
328                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
329             }
330         }
331         break;
332         default:
333             SWR_INVALID("Invalid float format");
334             break;
335         }
336     }
337 
GATHER4DD(const SWR_FORMAT_INFO & info,Value * pSrcBase,Value * byteOffsets,Value * vMask,Value * vGatherComponents[],bool bPackedOutput,MEM_CLIENT usage)338     void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
339                             Value*                 pSrcBase,
340                             Value*                 byteOffsets,
341                             Value*                 vMask,
342                             Value*                 vGatherComponents[],
343                             bool                   bPackedOutput,
344                             MEM_CLIENT             usage)
345     {
346         switch (info.bpp / info.numComps)
347         {
348         case 8:
349         {
350             Value* vGatherMaskedVal = VIMMED1((int32_t)0);
351             Value* vGatherResult =
352                 GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
353             // e.g. result of an 8x32bit integer gather for 8bit components
354             // 256i - 0    1    2    3    4    5    6    7
355             //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
356 
357             Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
358         }
359         break;
360         case 16:
361         {
362             Value* vGatherResult[2];
363 
364             // TODO: vGatherMaskedVal
365             Value* vGatherMaskedVal = VIMMED1((int32_t)0);
366 
367             // always have at least one component out of x or y to fetch
368 
369             vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
370             // e.g. result of first 8x32bit integer gather for 16bit components
371             // 256i - 0    1    2    3    4    5    6    7
372             //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
373             //
374 
375             // if we have at least one component out of x or y to fetch
376             if (info.numComps > 2)
377             {
378                 // offset base to the next components(zw) in the vertex to gather
379                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
380 
381                 vGatherResult[1] =
382                     GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
383                 // e.g. result of second 8x32bit integer gather for 16bit components
384                 // 256i - 0    1    2    3    4    5    6    7
385                 //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
386                 //
387             }
388             else
389             {
390                 vGatherResult[1] = vGatherMaskedVal;
391             }
392 
393             // Shuffle gathered components into place, each row is a component
394             Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
395         }
396         break;
397         case 32:
398         {
399             // apply defaults
400             for (uint32_t i = 0; i < 4; ++i)
401             {
402                 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
403             }
404 
405             for (uint32_t i = 0; i < info.numComps; i++)
406             {
407                 uint32_t swizzleIndex = info.swizzle[i];
408 
409                 // Gather a SIMD of components
410                 vGatherComponents[swizzleIndex] = GATHERDD(
411                     vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
412 
413                 // offset base to the next component to gather
414                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
415             }
416         }
417         break;
418         default:
419             SWR_INVALID("unsupported format");
420             break;
421         }
422     }
423 
Shuffle16bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput[2],Value * vGatherOutput[4],bool bPackedOutput)424     void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
425                                       Value*                 vGatherInput[2],
426                                       Value*                 vGatherOutput[4],
427                                       bool                   bPackedOutput)
428     {
429         // cast types
430         Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
431         Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
432 
433         // input could either be float or int vector; do shuffle work in int
434         vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
435         vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
436 
437         if (bPackedOutput)
438         {
439             Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
440                                               mVWidth / 4); // vwidth is units of 32 bits
441 
442             // shuffle mask
443             Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
444                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
445             Value* vShufResult =
446                 BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
447             // after pshufb: group components together in each 128bit lane
448             // 256i - 0    1    2    3    4    5    6    7
449             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
450 
451             Value* vi128XY =
452                 BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
453             // after PERMD: move and pack xy components into each 128bit lane
454             // 256i - 0    1    2    3    4    5    6    7
455             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
456 
457             // do the same for zw components
458             Value* vi128ZW = nullptr;
459             if (info.numComps > 2)
460             {
461                 Value* vShufResult =
462                     BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
463                 vi128ZW =
464                     BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
465             }
466 
467             for (uint32_t i = 0; i < 4; i++)
468             {
469                 uint32_t swizzleIndex = info.swizzle[i];
470                 // todo: fixed for packed
471                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
472                 if (i >= info.numComps)
473                 {
474                     // set the default component val
475                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
476                     continue;
477                 }
478 
479                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
480                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
481                 // if x or y, use vi128XY permute result, else use vi128ZW
482                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
483 
484                 // extract packed component 128 bit lanes
485                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
486             }
487         }
488         else
489         {
490             // pshufb masks for each component
491             Value* vConstMask[2];
492             // x/z shuffle mask
493             vConstMask[0] = C<char>({
494                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
495                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
496             });
497 
498             // y/w shuffle mask
499             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
500                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
501 
502             // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
503             // apply defaults
504             for (uint32_t i = 0; i < 4; ++i)
505             {
506                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
507             }
508 
509             for (uint32_t i = 0; i < info.numComps; i++)
510             {
511                 uint32_t swizzleIndex = info.swizzle[i];
512 
513                 // select correct constMask for x/z or y/w pshufb
514                 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
515                 // if x or y, use vi128XY permute result, else use vi128ZW
516                 uint32_t selectedGather = (i < 2) ? 0 : 1;
517 
518                 vGatherOutput[swizzleIndex] =
519                     BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
520                                    vConstMask[selectedMask]),
521                             vGatherTy);
522                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
523                 // 256i - 0    1    2    3    4    5    6    7
524                 //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
525             }
526         }
527     }
528 
Shuffle8bpcGather4(const SWR_FORMAT_INFO & info,Value * vGatherInput,Value * vGatherOutput[],bool bPackedOutput)529     void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
530                                      Value*                 vGatherInput,
531                                      Value*                 vGatherOutput[],
532                                      bool                   bPackedOutput)
533     {
534         // cast types
535         Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
536         Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
537 
538         if (bPackedOutput)
539         {
540             Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
541                                            mVWidth / 4); // vwidth is units of 32 bits
542                                                          // shuffle mask
543             Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
544                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
545             Value* vShufResult =
546                 BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
547             // after pshufb: group components together in each 128bit lane
548             // 256i - 0    1    2    3    4    5    6    7
549             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
550 
551             Value* vi128XY =
552                 BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
553             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
554             // 256i - 0    1    2    3    4    5    6    7
555             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
556 
557             // do the same for zw components
558             Value* vi128ZW = nullptr;
559             if (info.numComps > 2)
560             {
561                 vi128ZW =
562                     BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
563             }
564 
565             // sign extend all enabled components. If we have a fill vVertexElements, output to
566             // current simdvertex
567             for (uint32_t i = 0; i < 4; i++)
568             {
569                 uint32_t swizzleIndex = info.swizzle[i];
570                 // todo: fix for packed
571                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
572                 if (i >= info.numComps)
573                 {
574                     // set the default component val
575                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
576                     continue;
577                 }
578 
579                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
580                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
581                 // if x or y, use vi128XY permute result, else use vi128ZW
582                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
583 
584                 // sign extend
585                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
586             }
587         }
588         // else zero extend
589         else
590         {
591             // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
592             // apply defaults
593             for (uint32_t i = 0; i < 4; ++i)
594             {
595                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
596             }
597 
598             for (uint32_t i = 0; i < info.numComps; i++)
599             {
600                 uint32_t swizzleIndex = info.swizzle[i];
601 
602                 // pshufb masks for each component
603                 Value* vConstMask;
604                 switch (i)
605                 {
606                 case 0:
607                     // x shuffle mask
608                     vConstMask =
609                         C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
610                                  0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
611                     break;
612                 case 1:
613                     // y shuffle mask
614                     vConstMask =
615                         C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
616                                  1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
617                     break;
618                 case 2:
619                     // z shuffle mask
620                     vConstMask =
621                         C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
622                                  2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
623                     break;
624                 case 3:
625                     // w shuffle mask
626                     vConstMask =
627                         C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
628                                  3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
629                     break;
630                 default:
631                     vConstMask = nullptr;
632                     break;
633                 }
634 
635                 assert(vConstMask && "Invalid info.numComps value");
636                 vGatherOutput[swizzleIndex] =
637                     BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
638                 // after pshufb for x channel
639                 // 256i - 0    1    2    3    4    5    6    7
640                 //        x000 x000 x000 x000 x000 x000 x000 x000
641             }
642         }
643     }
644 
645     //////////////////////////////////////////////////////////////////////////
646     /// @brief emulates a scatter operation.
647     /// @param pDst - pointer to destination
648     /// @param vSrc - vector of src data to scatter
649     /// @param vOffsets - vector of byte offsets from pDst
650     /// @param vMask - mask of valid lanes
SCATTERPS(Value * pDst,Value * vSrc,Value * vOffsets,Value * vMask,MEM_CLIENT usage)651     void Builder::SCATTERPS(
652         Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
653     {
654         AssertMemoryUsageParams(pDst, usage);
655 #if LLVM_VERSION_MAJOR >= 11
656         SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
657 #else
658         SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
659 #endif
660         VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
661         return;
662 
663         /* Scatter algorithm
664 
665         while(Index = BitScanForward(mask))
666         srcElem = srcVector[Index]
667         offsetElem = offsetVector[Index]
668         *(pDst + offsetElem) = srcElem
669         Update mask (&= ~(1<<Index)
670 
671         */
672 
673         /*
674 
675         // Reference implementation kept around for reference
676 
677         BasicBlock* pCurBB = IRB()->GetInsertBlock();
678         Function*   pFunc  = pCurBB->getParent();
679         Type*       pSrcTy = vSrc->getType()->getVectorElementType();
680 
681         // Store vectors on stack
682         if (pScatterStackSrc == nullptr)
683         {
684             // Save off stack allocations and reuse per scatter. Significantly reduces stack
685             // requirements for shaders with a lot of scatters.
686             pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
687             pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
688         }
689 
690         Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
691         Value* pOffsetsArrayPtr = pScatterStackOffsets;
692         STORE(vSrc, pSrcArrayPtr);
693         STORE(vOffsets, pOffsetsArrayPtr);
694 
695         // Cast to pointers for random access
696         pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
697         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
698 
699         Value* pMask = VMOVMSK(vMask);
700 
701         // Setup loop basic block
702         BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
703 
704         // compute first set bit
705         Value* pIndex = CTTZ(pMask, C(false));
706 
707         Value* pIsUndef = ICMP_EQ(pIndex, C(32));
708 
709         // Split current block or create new one if building inline
710         BasicBlock* pPostLoop;
711         if (pCurBB->getTerminator())
712         {
713             pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
714 
715             // Remove unconditional jump created by splitBasicBlock
716             pCurBB->getTerminator()->eraseFromParent();
717 
718             // Add terminator to end of original block
719             IRB()->SetInsertPoint(pCurBB);
720 
721             // Add conditional branch
722             COND_BR(pIsUndef, pPostLoop, pLoop);
723         }
724         else
725         {
726             pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);
727 
728             // Add conditional branch
729             COND_BR(pIsUndef, pPostLoop, pLoop);
730         }
731 
732         // Add loop basic block contents
733         IRB()->SetInsertPoint(pLoop);
734         PHINode* pIndexPhi = PHI(mInt32Ty, 2);
735         PHINode* pMaskPhi  = PHI(mInt32Ty, 2);
736 
737         pIndexPhi->addIncoming(pIndex, pCurBB);
738         pMaskPhi->addIncoming(pMask, pCurBB);
739 
740         // Extract elements for this index
741         Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
742         Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});
743 
744         // GEP to this offset in dst
745         Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
746         pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
747         STORE(pSrcElem, pCurDst);
748 
749         // Update the mask
750         Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
751 
752         // Terminator
753         Value* pNewIndex = CTTZ(pNewMask, C(false));
754 
755         pIsUndef = ICMP_EQ(pNewIndex, C(32));
756         COND_BR(pIsUndef, pPostLoop, pLoop);
757 
758         // Update phi edges
759         pIndexPhi->addIncoming(pNewIndex, pLoop);
760         pMaskPhi->addIncoming(pNewMask, pLoop);
761 
762         // Move builder to beginning of post loop
763         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
764 
765         */
766     }
767 } // namespace SwrJit
768