1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * Helper functions for swizzling/shuffling.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  */
34 
35 #include <inttypes.h>  /* for PRIx64 macro */
36 #include "util/u_debug.h"
37 
38 #include "lp_bld_type.h"
39 #include "lp_bld_const.h"
40 #include "lp_bld_init.h"
41 #include "lp_bld_logic.h"
42 #include "lp_bld_swizzle.h"
43 #include "lp_bld_pack.h"
44 
45 
46 LLVMValueRef
lp_build_broadcast(struct gallivm_state * gallivm,LLVMTypeRef vec_type,LLVMValueRef scalar)47 lp_build_broadcast(struct gallivm_state *gallivm,
48                    LLVMTypeRef vec_type,
49                    LLVMValueRef scalar)
50 {
51    LLVMValueRef res;
52 
53    if (LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind) {
54       /* scalar */
55       assert(vec_type == LLVMTypeOf(scalar));
56       res = scalar;
57    } else {
58       LLVMBuilderRef builder = gallivm->builder;
59       const unsigned length = LLVMGetVectorSize(vec_type);
60       LLVMValueRef undef = LLVMGetUndef(vec_type);
61       /* The shuffle vector is always made of int32 elements */
62       LLVMTypeRef i32_type = LLVMInt32TypeInContext(gallivm->context);
63       LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
64 
65       assert(LLVMGetElementType(vec_type) == LLVMTypeOf(scalar));
66 
67       res = LLVMBuildInsertElement(builder, undef, scalar, LLVMConstNull(i32_type), "");
68       res = LLVMBuildShuffleVector(builder, res, undef, LLVMConstNull(i32_vec_type), "");
69    }
70 
71    return res;
72 }
73 
74 
75 /**
76  * Broadcast
77  */
78 LLVMValueRef
lp_build_broadcast_scalar(struct lp_build_context * bld,LLVMValueRef scalar)79 lp_build_broadcast_scalar(struct lp_build_context *bld,
80                           LLVMValueRef scalar)
81 {
82    assert(lp_check_elem_type(bld->type, LLVMTypeOf(scalar)));
83 
84    return lp_build_broadcast(bld->gallivm, bld->vec_type, scalar);
85 }
86 
87 
88 /**
89  * Combined extract and broadcast (mere shuffle in most cases)
90  */
91 LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state * gallivm,struct lp_type src_type,struct lp_type dst_type,LLVMValueRef vector,LLVMValueRef index)92 lp_build_extract_broadcast(struct gallivm_state *gallivm,
93                            struct lp_type src_type,
94                            struct lp_type dst_type,
95                            LLVMValueRef vector,
96                            LLVMValueRef index)
97 {
98    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
99    LLVMValueRef res;
100 
101    assert(src_type.floating == dst_type.floating);
102    assert(src_type.width    == dst_type.width);
103 
104    assert(lp_check_value(src_type, vector));
105    assert(LLVMTypeOf(index) == i32t);
106 
107    if (src_type.length == 1) {
108       if (dst_type.length == 1) {
109          /*
110           * Trivial scalar -> scalar.
111           */
112 
113          res = vector;
114       }
115       else {
116          /*
117           * Broadcast scalar -> vector.
118           */
119 
120          res = lp_build_broadcast(gallivm,
121                                   lp_build_vec_type(gallivm, dst_type),
122                                   vector);
123       }
124    }
125    else {
126       if (dst_type.length > 1) {
127          /*
128           * shuffle - result can be of different length.
129           */
130 
131          LLVMValueRef shuffle;
132          shuffle = lp_build_broadcast(gallivm,
133                                       LLVMVectorType(i32t, dst_type.length),
134                                       index);
135          res = LLVMBuildShuffleVector(gallivm->builder, vector,
136                                       LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
137                                       shuffle, "");
138       }
139       else {
140          /*
141           * Trivial extract scalar from vector.
142           */
143           res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
144       }
145    }
146 
147    return res;
148 }
149 
150 
151 /**
152  * Swizzle one channel into other channels.
153  */
154 LLVMValueRef
lp_build_swizzle_scalar_aos(struct lp_build_context * bld,LLVMValueRef a,unsigned channel,unsigned num_channels)155 lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
156                             LLVMValueRef a,
157                             unsigned channel,
158                             unsigned num_channels)
159 {
160    LLVMBuilderRef builder = bld->gallivm->builder;
161    const struct lp_type type = bld->type;
162    const unsigned n = type.length;
163    unsigned i, j;
164 
165    if(a == bld->undef || a == bld->zero || a == bld->one || num_channels == 1)
166       return a;
167 
168    assert(num_channels == 2 || num_channels == 4);
169 
170    /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
171     * using shuffles here actually causes worst results. More investigation is
172     * needed. */
173    if (LLVMIsConstant(a) ||
174        type.width >= 16) {
175       /*
176        * Shuffle.
177        */
178       LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
179       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
180 
181       for(j = 0; j < n; j += num_channels)
182          for(i = 0; i < num_channels; ++i)
183             shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
184 
185       return LLVMBuildShuffleVector(builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
186    }
187    else if (num_channels == 2) {
188       /*
189        * Bit mask and shifts
190        *
191        *   XY XY .... XY  <= input
192        *   0Y 0Y .... 0Y
193        *   YY YY .... YY
194        *   YY YY .... YY  <= output
195        */
196       struct lp_type type2;
197       LLVMValueRef tmp = NULL;
198       int shift;
199 
200       a = LLVMBuildAnd(builder, a,
201                        lp_build_const_mask_aos(bld->gallivm,
202                                                type, 1 << channel, num_channels), "");
203 
204       type2 = type;
205       type2.floating = FALSE;
206       type2.width *= 2;
207       type2.length /= 2;
208 
209       a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), "");
210 
211       /*
212        * Vector element 0 is always channel X.
213        *
214        *                        76 54 32 10 (array numbering)
215        * Little endian reg in:  YX YX YX YX
216        * Little endian reg out: YY YY YY YY if shift right (shift == -1)
217        *                        XX XX XX XX if shift left (shift == 1)
218        *
219        *                        01 23 45 67 (array numbering)
220        * Big endian reg in:     XY XY XY XY
221        * Big endian reg out:    YY YY YY YY if shift left (shift == 1)
222        *                        XX XX XX XX if shift right (shift == -1)
223        *
224        */
225 #if UTIL_ARCH_LITTLE_ENDIAN
226       shift = channel == 0 ? 1 : -1;
227 #else
228       shift = channel == 0 ? -1 : 1;
229 #endif
230 
231       if (shift > 0) {
232          tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type2, shift * type.width), "");
233       } else if (shift < 0) {
234          tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type2, -shift * type.width), "");
235       }
236 
237       assert(tmp);
238       if (tmp) {
239          a = LLVMBuildOr(builder, a, tmp, "");
240       }
241 
242       return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
243    }
244    else {
245       /*
246        * Bit mask and recursive shifts
247        *
248        * Little-endian registers:
249        *
250        *   7654 3210
251        *   WZYX WZYX .... WZYX  <= input
252        *   00Y0 00Y0 .... 00Y0  <= mask
253        *   00YY 00YY .... 00YY  <= shift right 1 (shift amount -1)
254        *   YYYY YYYY .... YYYY  <= shift left 2 (shift amount 2)
255        *
256        * Big-endian registers:
257        *
258        *   0123 4567
259        *   XYZW XYZW .... XYZW  <= input
260        *   0Y00 0Y00 .... 0Y00  <= mask
261        *   YY00 YY00 .... YY00  <= shift left 1 (shift amount 1)
262        *   YYYY YYYY .... YYYY  <= shift right 2 (shift amount -2)
263        *
264        * shifts[] gives little-endian shift amounts; we need to negate for big-endian.
265        */
266       struct lp_type type4;
267       const int shifts[4][2] = {
268          { 1,  2},
269          {-1,  2},
270          { 1, -2},
271          {-1, -2}
272       };
273       unsigned i;
274 
275       a = LLVMBuildAnd(builder, a,
276                        lp_build_const_mask_aos(bld->gallivm,
277                                                type, 1 << channel, 4), "");
278 
279       /*
280        * Build a type where each element is an integer that cover the four
281        * channels.
282        */
283 
284       type4 = type;
285       type4.floating = FALSE;
286       type4.width *= 4;
287       type4.length /= 4;
288 
289       a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), "");
290 
291       for(i = 0; i < 2; ++i) {
292          LLVMValueRef tmp = NULL;
293          int shift = shifts[channel][i];
294 
295          /* See endianness diagram above */
296 #if UTIL_ARCH_BIG_ENDIAN
297          shift = -shift;
298 #endif
299 
300          if(shift > 0)
301             tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
302          if(shift < 0)
303             tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
304 
305          assert(tmp);
306          if(tmp)
307             a = LLVMBuildOr(builder, a, tmp, "");
308       }
309 
310       return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
311    }
312 }
313 
314 
315 /**
316  * Swizzle a vector consisting of an array of XYZW structs.
317  *
318  * This fills a vector of dst_len length with the swizzled channels from src.
319  *
320  * e.g. with swizzles = { 2, 1, 0 } and swizzle_count = 6 results in
321  *      RGBA RGBA = BGR BGR BG
322  *
323  * @param swizzles        the swizzle array
324  * @param num_swizzles    the number of elements in swizzles
325  * @param dst_len         the length of the result
326  */
327 LLVMValueRef
lp_build_swizzle_aos_n(struct gallivm_state * gallivm,LLVMValueRef src,const unsigned char * swizzles,unsigned num_swizzles,unsigned dst_len)328 lp_build_swizzle_aos_n(struct gallivm_state* gallivm,
329                        LLVMValueRef src,
330                        const unsigned char* swizzles,
331                        unsigned num_swizzles,
332                        unsigned dst_len)
333 {
334    LLVMBuilderRef builder = gallivm->builder;
335    LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH];
336    unsigned i;
337 
338    assert(dst_len < LP_MAX_VECTOR_WIDTH);
339 
340    for (i = 0; i < dst_len; ++i) {
341       int swizzle = swizzles[i % num_swizzles];
342 
343       if (swizzle == LP_BLD_SWIZZLE_DONTCARE) {
344          shuffles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
345       } else {
346          shuffles[i] = lp_build_const_int32(gallivm, swizzle);
347       }
348    }
349 
350    return LLVMBuildShuffleVector(builder, src, LLVMGetUndef(LLVMTypeOf(src)), LLVMConstVector(shuffles, dst_len), "");
351 }
352 
353 
354 LLVMValueRef
lp_build_swizzle_aos(struct lp_build_context * bld,LLVMValueRef a,const unsigned char swizzles[4])355 lp_build_swizzle_aos(struct lp_build_context *bld,
356                      LLVMValueRef a,
357                      const unsigned char swizzles[4])
358 {
359    LLVMBuilderRef builder = bld->gallivm->builder;
360    const struct lp_type type = bld->type;
361    const unsigned n = type.length;
362    unsigned i, j;
363 
364    if (swizzles[0] == PIPE_SWIZZLE_X &&
365        swizzles[1] == PIPE_SWIZZLE_Y &&
366        swizzles[2] == PIPE_SWIZZLE_Z &&
367        swizzles[3] == PIPE_SWIZZLE_W) {
368       return a;
369    }
370 
371    if (swizzles[0] == swizzles[1] &&
372        swizzles[1] == swizzles[2] &&
373        swizzles[2] == swizzles[3]) {
374       switch (swizzles[0]) {
375       case PIPE_SWIZZLE_X:
376       case PIPE_SWIZZLE_Y:
377       case PIPE_SWIZZLE_Z:
378       case PIPE_SWIZZLE_W:
379          return lp_build_swizzle_scalar_aos(bld, a, swizzles[0], 4);
380       case PIPE_SWIZZLE_0:
381          return bld->zero;
382       case PIPE_SWIZZLE_1:
383          return bld->one;
384       case LP_BLD_SWIZZLE_DONTCARE:
385          return bld->undef;
386       default:
387          assert(0);
388          return bld->undef;
389       }
390    }
391 
392    if (LLVMIsConstant(a) ||
393        type.width >= 16) {
394       /*
395        * Shuffle.
396        */
397       LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(bld->gallivm, type));
398       LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
399       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
400       LLVMValueRef aux[LP_MAX_VECTOR_LENGTH];
401 
402       memset(aux, 0, sizeof aux);
403 
404       for(j = 0; j < n; j += 4) {
405          for(i = 0; i < 4; ++i) {
406             unsigned shuffle;
407             switch (swizzles[i]) {
408             default:
409                assert(0);
410                /* fall through */
411             case PIPE_SWIZZLE_X:
412             case PIPE_SWIZZLE_Y:
413             case PIPE_SWIZZLE_Z:
414             case PIPE_SWIZZLE_W:
415                shuffle = j + swizzles[i];
416                shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
417                break;
418             case PIPE_SWIZZLE_0:
419                shuffle = type.length + 0;
420                shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
421                if (!aux[0]) {
422                   aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
423                }
424                break;
425             case PIPE_SWIZZLE_1:
426                shuffle = type.length + 1;
427                shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
428                if (!aux[1]) {
429                   aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
430                }
431                break;
432             case LP_BLD_SWIZZLE_DONTCARE:
433                shuffles[j + i] = LLVMGetUndef(i32t);
434                break;
435             }
436          }
437       }
438 
439       for (i = 0; i < n; ++i) {
440          if (!aux[i]) {
441             aux[i] = undef;
442          }
443       }
444 
445       return LLVMBuildShuffleVector(builder, a,
446                                     LLVMConstVector(aux, n),
447                                     LLVMConstVector(shuffles, n), "");
448    } else {
449       /*
450        * Bit mask and shifts.
451        *
452        * For example, this will convert BGRA to RGBA by doing
453        *
454        * Little endian:
455        *   rgba = (bgra & 0x00ff0000) >> 16
456        *        | (bgra & 0xff00ff00)
457        *        | (bgra & 0x000000ff) << 16
458        *
459        * Big endian:A
460        *   rgba = (bgra & 0x0000ff00) << 16
461        *        | (bgra & 0x00ff00ff)
462        *        | (bgra & 0xff000000) >> 16
463        *
464        * This is necessary not only for faster cause, but because X86 backend
465        * will refuse shuffles of <4 x i8> vectors
466        */
467       LLVMValueRef res;
468       struct lp_type type4;
469       unsigned cond = 0;
470       int chan;
471       int shift;
472 
473       /*
474        * Start with a mixture of 1 and 0.
475        */
476       for (chan = 0; chan < 4; ++chan) {
477          if (swizzles[chan] == PIPE_SWIZZLE_1) {
478             cond |= 1 << chan;
479          }
480       }
481       res = lp_build_select_aos(bld, cond, bld->one, bld->zero, 4);
482 
483       /*
484        * Build a type where each element is an integer that cover the four
485        * channels.
486        */
487       type4 = type;
488       type4.floating = FALSE;
489       type4.width *= 4;
490       type4.length /= 4;
491 
492       a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), "");
493       res = LLVMBuildBitCast(builder, res, lp_build_vec_type(bld->gallivm, type4), "");
494 
495       /*
496        * Mask and shift the channels, trying to group as many channels in the
497        * same shift as possible.  The shift amount is positive for shifts left
498        * and negative for shifts right.
499        */
500       for (shift = -3; shift <= 3; ++shift) {
501          uint64_t mask = 0;
502 
503          assert(type4.width <= sizeof(mask)*8);
504 
505          /*
506           * Vector element numbers follow the XYZW order, so 0 is always X, etc.
507           * After widening 4 times we have:
508           *
509           *                                3210
510           * Little-endian register layout: WZYX
511           *
512           *                                0123
513           * Big-endian register layout:    XYZW
514           *
515           * For little-endian, higher-numbered channels are obtained by a shift right
516           * (negative shift amount) and lower-numbered channels by a shift left
517           * (positive shift amount).  The opposite is true for big-endian.
518           */
519          for (chan = 0; chan < 4; ++chan) {
520             if (swizzles[chan] < 4) {
521                /* We need to move channel swizzles[chan] into channel chan */
522 #if UTIL_ARCH_LITTLE_ENDIAN
523                if (swizzles[chan] - chan == -shift) {
524                   mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
525                }
526 #else
527                if (swizzles[chan] - chan == shift) {
528                   mask |= ((1ULL << type.width) - 1) << (type4.width - type.width) >> (swizzles[chan] * type.width);
529                }
530 #endif
531             }
532          }
533 
534          if (mask) {
535             LLVMValueRef masked;
536             LLVMValueRef shifted;
537             if (0)
538                debug_printf("shift = %i, mask = %" PRIx64 "\n", shift, mask);
539 
540             masked = LLVMBuildAnd(builder, a,
541                                   lp_build_const_int_vec(bld->gallivm, type4, mask), "");
542             if (shift > 0) {
543                shifted = LLVMBuildShl(builder, masked,
544                                       lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
545             } else if (shift < 0) {
546                shifted = LLVMBuildLShr(builder, masked,
547                                        lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
548             } else {
549                shifted = masked;
550             }
551 
552             res = LLVMBuildOr(builder, res, shifted, "");
553          }
554       }
555 
556       return LLVMBuildBitCast(builder, res,
557                               lp_build_vec_type(bld->gallivm, type), "");
558    }
559 }
560 
561 
562 /**
563  * Extended swizzle of a single channel of a SoA vector.
564  *
565  * @param bld         building context
566  * @param unswizzled  array with the 4 unswizzled values
567  * @param swizzle     one of the PIPE_SWIZZLE_*
568  *
569  * @return  the swizzled value.
570  */
571 LLVMValueRef
lp_build_swizzle_soa_channel(struct lp_build_context * bld,const LLVMValueRef * unswizzled,unsigned swizzle)572 lp_build_swizzle_soa_channel(struct lp_build_context *bld,
573                              const LLVMValueRef *unswizzled,
574                              unsigned swizzle)
575 {
576    switch (swizzle) {
577    case PIPE_SWIZZLE_X:
578    case PIPE_SWIZZLE_Y:
579    case PIPE_SWIZZLE_Z:
580    case PIPE_SWIZZLE_W:
581       return unswizzled[swizzle];
582    case PIPE_SWIZZLE_0:
583       return bld->zero;
584    case PIPE_SWIZZLE_1:
585       return bld->one;
586    default:
587       assert(0);
588       return bld->undef;
589    }
590 }
591 
592 
593 /**
594  * Extended swizzle of a SoA vector.
595  *
596  * @param bld         building context
597  * @param unswizzled  array with the 4 unswizzled values
598  * @param swizzles    array of PIPE_SWIZZLE_*
599  * @param swizzled    output swizzled values
600  */
601 void
lp_build_swizzle_soa(struct lp_build_context * bld,const LLVMValueRef * unswizzled,const unsigned char swizzles[4],LLVMValueRef * swizzled)602 lp_build_swizzle_soa(struct lp_build_context *bld,
603                      const LLVMValueRef *unswizzled,
604                      const unsigned char swizzles[4],
605                      LLVMValueRef *swizzled)
606 {
607    unsigned chan;
608 
609    for (chan = 0; chan < 4; ++chan) {
610       swizzled[chan] = lp_build_swizzle_soa_channel(bld, unswizzled,
611                                                     swizzles[chan]);
612    }
613 }
614 
615 
616 /**
617  * Do an extended swizzle of a SoA vector inplace.
618  *
619  * @param bld         building context
620  * @param values      intput/output array with the 4 values
621  * @param swizzles    array of PIPE_SWIZZLE_*
622  */
623 void
lp_build_swizzle_soa_inplace(struct lp_build_context * bld,LLVMValueRef * values,const unsigned char swizzles[4])624 lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
625                              LLVMValueRef *values,
626                              const unsigned char swizzles[4])
627 {
628    LLVMValueRef unswizzled[4];
629    unsigned chan;
630 
631    for (chan = 0; chan < 4; ++chan) {
632       unswizzled[chan] = values[chan];
633    }
634 
635    lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
636 }
637 
638 
639 /**
640  * Transpose from AOS <-> SOA
641  *
642  * @param single_type_lp   type of pixels
643  * @param src              the 4 * n pixel input
644  * @param dst              the 4 * n pixel output
645  */
646 void
lp_build_transpose_aos(struct gallivm_state * gallivm,struct lp_type single_type_lp,const LLVMValueRef src[4],LLVMValueRef dst[4])647 lp_build_transpose_aos(struct gallivm_state *gallivm,
648                        struct lp_type single_type_lp,
649                        const LLVMValueRef src[4],
650                        LLVMValueRef dst[4])
651 {
652    struct lp_type double_type_lp = single_type_lp;
653    LLVMTypeRef single_type;
654    LLVMTypeRef double_type;
655    LLVMValueRef t0 = NULL, t1 = NULL, t2 = NULL, t3 = NULL;
656 
657    double_type_lp.length >>= 1;
658    double_type_lp.width  <<= 1;
659 
660    double_type = lp_build_vec_type(gallivm, double_type_lp);
661    single_type = lp_build_vec_type(gallivm, single_type_lp);
662 
663    LLVMValueRef double_type_zero = LLVMConstNull(double_type);
664    /* Interleave x, y, z, w -> xy and zw */
665    if (src[0] || src[1]) {
666       LLVMValueRef src0 = src[0];
667       LLVMValueRef src1 = src[1];
668       if (!src0)
669          src0 = LLVMConstNull(single_type);
670       if (!src1)
671          src1 = LLVMConstNull(single_type);
672       t0 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 0);
673       t2 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 1);
674 
675       /* Cast to double width type for second interleave */
676       t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
677       t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
678    }
679    if (src[2] || src[3]) {
680       LLVMValueRef src2 = src[2];
681       LLVMValueRef src3 = src[3];
682       if (!src2)
683          src2 = LLVMConstNull(single_type);
684       if (!src3)
685          src3 = LLVMConstNull(single_type);
686       t1 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 0);
687       t3 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 1);
688 
689       /* Cast to double width type for second interleave */
690       t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
691       t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
692    }
693 
694    if (!t0)
695       t0 = double_type_zero;
696    if (!t1)
697       t1 = double_type_zero;
698    if (!t2)
699       t2 = double_type_zero;
700    if (!t3)
701       t3 = double_type_zero;
702 
703    /* Interleave xy, zw -> xyzw */
704    dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
705    dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
706    dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
707    dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
708 
709    /* Cast back to original single width type */
710    dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
711    dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
712    dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
713    dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
714 }
715 
716 
717 /**
718  * Transpose from AOS <-> SOA for num_srcs
719  */
720 void
lp_build_transpose_aos_n(struct gallivm_state * gallivm,struct lp_type type,const LLVMValueRef * src,unsigned num_srcs,LLVMValueRef * dst)721 lp_build_transpose_aos_n(struct gallivm_state *gallivm,
722                          struct lp_type type,
723                          const LLVMValueRef* src,
724                          unsigned num_srcs,
725                          LLVMValueRef* dst)
726 {
727    switch (num_srcs) {
728       case 1:
729          dst[0] = src[0];
730          break;
731 
732       case 2:
733       {
734          /* Note: we must use a temporary incase src == dst */
735          LLVMValueRef lo, hi;
736 
737          lo = lp_build_interleave2_half(gallivm, type, src[0], src[1], 0);
738          hi = lp_build_interleave2_half(gallivm, type, src[0], src[1], 1);
739 
740          dst[0] = lo;
741          dst[1] = hi;
742          break;
743       }
744 
745       case 4:
746          lp_build_transpose_aos(gallivm, type, src, dst);
747          break;
748 
749       default:
750          assert(0);
751    }
752 }
753 
754 
755 /**
756  * Pack n-th element of aos values,
757  * pad out to destination size.
758  * i.e. x1 y1 _ _ x2 y2 _ _ will become x1 x2 _ _
759  */
760 LLVMValueRef
lp_build_pack_aos_scalars(struct gallivm_state * gallivm,struct lp_type src_type,struct lp_type dst_type,const LLVMValueRef src,unsigned channel)761 lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
762                           struct lp_type src_type,
763                           struct lp_type dst_type,
764                           const LLVMValueRef src,
765                           unsigned channel)
766 {
767    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
768    LLVMValueRef undef = LLVMGetUndef(i32t);
769    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
770    unsigned num_src = src_type.length / 4;
771    unsigned num_dst = dst_type.length;
772    unsigned i;
773 
774    assert(num_src <= num_dst);
775 
776    for (i = 0; i < num_src; i++) {
777       shuffles[i] = LLVMConstInt(i32t, i * 4 + channel, 0);
778    }
779    for (i = num_src; i < num_dst; i++) {
780       shuffles[i] = undef;
781    }
782 
783    if (num_dst == 1) {
784       return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
785    }
786    else {
787       return LLVMBuildShuffleVector(gallivm->builder, src, src,
788                                     LLVMConstVector(shuffles, num_dst), "");
789    }
790 }
791 
792 
793 /**
794  * Unpack and broadcast packed aos values consisting of only the
795  * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2
796  */
797 LLVMValueRef
lp_build_unpack_broadcast_aos_scalars(struct gallivm_state * gallivm,struct lp_type src_type,struct lp_type dst_type,const LLVMValueRef src)798 lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
799                                       struct lp_type src_type,
800                                       struct lp_type dst_type,
801                                       const LLVMValueRef src)
802 {
803    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
804    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
805    unsigned num_dst = dst_type.length;
806    unsigned num_src = dst_type.length / 4;
807    unsigned i;
808 
809    assert(num_dst / 4 <= src_type.length);
810 
811    for (i = 0; i < num_src; i++) {
812       shuffles[i*4] = LLVMConstInt(i32t, i, 0);
813       shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
814       shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
815       shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
816    }
817 
818    if (num_src == 1) {
819       return lp_build_extract_broadcast(gallivm, src_type, dst_type,
820                                         src, shuffles[0]);
821    }
822    else {
823       return LLVMBuildShuffleVector(gallivm->builder, src, src,
824                                     LLVMConstVector(shuffles, num_dst), "");
825    }
826 }
827 
828