1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 
29 #include "pipe/p_defines.h"
30 
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 #include "util/u_math.h"
35 
36 #include "lp_bld_type.h"
37 #include "lp_bld_const.h"
38 #include "lp_bld_conv.h"
39 #include "lp_bld_swizzle.h"
40 #include "lp_bld_gather.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_format.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_pack.h"
45 
46 
47 static void
convert_to_soa(struct gallivm_state * gallivm,LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH/32],LLVMValueRef dst_soa[4],const struct lp_type soa_type)48 convert_to_soa(struct gallivm_state *gallivm,
49                LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
50                LLVMValueRef dst_soa[4],
51                const struct lp_type soa_type)
52 {
53    unsigned j, k;
54    struct lp_type aos_channel_type = soa_type;
55 
56    LLVMValueRef aos_channels[4];
57    unsigned pixels_per_channel = soa_type.length / 4;
58 
59    debug_assert((soa_type.length % 4) == 0);
60 
61    aos_channel_type.length >>= 1;
62 
63    for (j = 0; j < 4; ++j) {
64       LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
65 
66       assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
67 
68       for (k = 0; k < pixels_per_channel; ++k) {
69          channel[k] = src_aos[j + 4 * k];
70       }
71 
72       aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
73    }
74 
75    lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
76 }
77 
78 
79 void
lp_build_format_swizzle_soa(const struct util_format_description * format_desc,struct lp_build_context * bld,const LLVMValueRef * unswizzled,LLVMValueRef swizzled_out[4])80 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
81                             struct lp_build_context *bld,
82                             const LLVMValueRef *unswizzled,
83                             LLVMValueRef swizzled_out[4])
84 {
85    if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
86       enum pipe_swizzle swizzle;
87       LLVMValueRef depth_or_stencil;
88 
89       if (util_format_has_stencil(format_desc) &&
90           !util_format_has_depth(format_desc)) {
91          assert(!bld->type.floating);
92          swizzle = format_desc->swizzle[1];
93       }
94       else {
95          assert(bld->type.floating);
96          swizzle = format_desc->swizzle[0];
97       }
98       /*
99        * Return zzz1 or sss1 for depth-stencil formats here.
100        * Correct swizzling will be handled by apply_sampler_swizzle() later.
101        */
102       depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
103 
104       swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
105       swizzled_out[3] = bld->one;
106    }
107    else {
108       unsigned chan;
109       for (chan = 0; chan < 4; ++chan) {
110          enum pipe_swizzle swizzle = format_desc->swizzle[chan];
111          swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
112       }
113    }
114 }
115 
116 
117 
118 static LLVMValueRef
lp_build_extract_soa_chan(struct lp_build_context * bld,unsigned blockbits,boolean srgb_chan,struct util_format_channel_description chan_desc,LLVMValueRef packed)119 lp_build_extract_soa_chan(struct lp_build_context *bld,
120                           unsigned blockbits,
121                           boolean srgb_chan,
122                           struct util_format_channel_description chan_desc,
123                           LLVMValueRef packed)
124 {
125    struct gallivm_state *gallivm = bld->gallivm;
126    LLVMBuilderRef builder = gallivm->builder;
127    struct lp_type type = bld->type;
128    LLVMValueRef input = packed;
129    const unsigned width = chan_desc.size;
130    const unsigned start = chan_desc.shift;
131    const unsigned stop = start + width;
132 
133    /* Decode the input vector component */
134 
135    switch(chan_desc.type) {
136    case UTIL_FORMAT_TYPE_VOID:
137       input = bld->undef;
138       break;
139 
140    case UTIL_FORMAT_TYPE_UNSIGNED:
141       /*
142        * Align the LSB
143        */
144       if (start) {
145          input = LLVMBuildLShr(builder, input,
146                                lp_build_const_int_vec(gallivm, type, start), "");
147       }
148 
149       /*
150        * Zero the MSBs
151        */
152       if (stop < blockbits) {
153          unsigned mask = ((unsigned long long)1 << width) - 1;
154          input = LLVMBuildAnd(builder, input,
155                               lp_build_const_int_vec(gallivm, type, mask), "");
156       }
157 
158       /*
159        * Type conversion
160        */
161       if (type.floating) {
162          if (srgb_chan) {
163             struct lp_type conv_type = lp_uint_type(type);
164             input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
165          }
166          else {
167             if(chan_desc.normalized)
168                input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
169             else
170                input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
171          }
172       }
173       else if (chan_desc.pure_integer) {
174          /* Nothing to do */
175       } else {
176           /* FIXME */
177           assert(0);
178       }
179       break;
180 
181    case UTIL_FORMAT_TYPE_SIGNED:
182       /*
183        * Align the sign bit first.
184        */
185       if (stop < type.width) {
186          unsigned bits = type.width - stop;
187          LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
188          input = LLVMBuildShl(builder, input, bits_val, "");
189       }
190 
191       /*
192        * Align the LSB (with an arithmetic shift to preserve the sign)
193        */
194       if (chan_desc.size < type.width) {
195          unsigned bits = type.width - chan_desc.size;
196          LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
197          input = LLVMBuildAShr(builder, input, bits_val, "");
198       }
199 
200       /*
201        * Type conversion
202        */
203       if (type.floating) {
204          input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
205          if (chan_desc.normalized) {
206             double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
207             LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
208             input = LLVMBuildFMul(builder, input, scale_val, "");
209             /*
210              * The formula above will produce value below -1.0 for most negative
211              * value but everything seems happy with that hence disable for now.
212              */
213             if (0)
214                input = lp_build_max(bld, input,
215                                     lp_build_const_vec(gallivm, type, -1.0f));
216          }
217       }
218       else if (chan_desc.pure_integer) {
219          /* Nothing to do */
220       } else {
221           /* FIXME */
222           assert(0);
223       }
224       break;
225 
226    case UTIL_FORMAT_TYPE_FLOAT:
227       if (type.floating) {
228          if (chan_desc.size == 16) {
229             struct lp_type f16i_type = type;
230             f16i_type.width /= 2;
231             f16i_type.floating = 0;
232             if (start) {
233                input = LLVMBuildLShr(builder, input,
234                                      lp_build_const_int_vec(gallivm, type, start), "");
235             }
236             input = LLVMBuildTrunc(builder, input,
237                                    lp_build_vec_type(gallivm, f16i_type), "");
238             input = lp_build_half_to_float(gallivm, input);
239          } else {
240             assert(start == 0);
241             assert(stop == 32);
242             assert(type.width == 32);
243          }
244          input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
245       }
246       else {
247          /* FIXME */
248          assert(0);
249          input = bld->undef;
250       }
251       break;
252 
253    case UTIL_FORMAT_TYPE_FIXED:
254       if (type.floating) {
255          double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
256          LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
257          input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
258          input = LLVMBuildFMul(builder, input, scale_val, "");
259       }
260       else {
261          /* FIXME */
262          assert(0);
263          input = bld->undef;
264       }
265       break;
266 
267    default:
268       assert(0);
269       input = bld->undef;
270       break;
271    }
272 
273    return input;
274 }
275 
276 
277 /**
278  * Unpack several pixels in SoA.
279  *
280  * It takes a vector of packed pixels:
281  *
282  *   packed = {P0, P1, P2, P3, ..., Pn}
283  *
284  * And will produce four vectors:
285  *
286  *   red    = {R0, R1, R2, R3, ..., Rn}
287  *   green  = {G0, G1, G2, G3, ..., Gn}
288  *   blue   = {B0, B1, B2, B3, ..., Bn}
289  *   alpha  = {A0, A1, A2, A3, ..., An}
290  *
291  * It requires that a packed pixel fits into an element of the output
292  * channels. The common case is when converting pixel with a depth of 32 bit or
293  * less into floats.
294  *
295  * \param format_desc  the format of the 'packed' incoming pixel vector
296  * \param type  the desired type for rgba_out (type.length = n, above)
297  * \param packed  the incoming vector of packed pixels
298  * \param rgba_out  returns the SoA R,G,B,A vectors
299  */
300 void
lp_build_unpack_rgba_soa(struct gallivm_state * gallivm,const struct util_format_description * format_desc,struct lp_type type,LLVMValueRef packed,LLVMValueRef rgba_out[4])301 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
302                          const struct util_format_description *format_desc,
303                          struct lp_type type,
304                          LLVMValueRef packed,
305                          LLVMValueRef rgba_out[4])
306 {
307    struct lp_build_context bld;
308    LLVMValueRef inputs[4];
309    unsigned chan;
310 
311    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
312    assert(format_desc->block.width == 1);
313    assert(format_desc->block.height == 1);
314    assert(format_desc->block.bits <= type.width);
315    /* FIXME: Support more output types */
316    assert(type.width == 32);
317 
318    lp_build_context_init(&bld, gallivm, type);
319 
320    /* Decode the input vector components */
321    for (chan = 0; chan < format_desc->nr_channels; ++chan) {
322       struct util_format_channel_description chan_desc = format_desc->channel[chan];
323       boolean srgb_chan = FALSE;
324 
325       if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
326           format_desc->swizzle[3] != chan) {
327          srgb_chan = TRUE;
328       }
329 
330       inputs[chan] = lp_build_extract_soa_chan(&bld,
331                                                format_desc->block.bits,
332                                                srgb_chan,
333                                                chan_desc,
334                                                packed);
335    }
336 
337    lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
338 }
339 
340 
341 /**
342  * Convert a vector of rgba8 values into 32bit wide SoA vectors.
343  *
344  * \param dst_type  The desired return type. For pure integer formats
345  *                  this should be a 32bit wide int or uint vector type,
346  *                  otherwise a float vector type.
347  *
348  * \param packed    The rgba8 values to pack.
349  *
350  * \param rgba      The 4 SoA return vectors.
351  */
352 void
lp_build_rgba8_to_fi32_soa(struct gallivm_state * gallivm,struct lp_type dst_type,LLVMValueRef packed,LLVMValueRef * rgba)353 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
354                            struct lp_type dst_type,
355                            LLVMValueRef packed,
356                            LLVMValueRef *rgba)
357 {
358    LLVMBuilderRef builder = gallivm->builder;
359    LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
360    unsigned chan;
361 
362    /* XXX technically shouldn't use that for uint dst_type */
363    packed = LLVMBuildBitCast(builder, packed,
364                              lp_build_int_vec_type(gallivm, dst_type), "");
365 
366    /* Decode the input vector components */
367    for (chan = 0; chan < 4; ++chan) {
368 #ifdef PIPE_ARCH_LITTLE_ENDIAN
369       unsigned start = chan*8;
370 #else
371       unsigned start = (3-chan)*8;
372 #endif
373       unsigned stop = start + 8;
374       LLVMValueRef input;
375 
376       input = packed;
377 
378       if (start)
379          input = LLVMBuildLShr(builder, input,
380                                lp_build_const_int_vec(gallivm, dst_type, start), "");
381 
382       if (stop < 32)
383          input = LLVMBuildAnd(builder, input, mask, "");
384 
385       if (dst_type.floating)
386          input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
387 
388       rgba[chan] = input;
389    }
390 }
391 
392 
393 
394 /**
395  * Fetch a texels from a texture, returning them in SoA layout.
396  *
397  * \param type  the desired return type for 'rgba'.  The vector length
398  *              is the number of texels to fetch
399  * \param aligned if the offset is guaranteed to be aligned to element width
400  *
401  * \param base_ptr  points to the base of the texture mip tree.
402  * \param offset    offset to start of the texture image block.  For non-
403  *                  compressed formats, this simply is an offset to the texel.
404  *                  For compressed formats, it is an offset to the start of the
405  *                  compressed data block.
406  *
407  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
408  *              these will always be (0,0).  For compressed formats, i will
409  *              be in [0, block_width-1] and j will be in [0, block_height-1].
410  * \param cache  optional value pointing to a lp_build_format_cache structure
411  */
412 void
lp_build_fetch_rgba_soa(struct gallivm_state * gallivm,const struct util_format_description * format_desc,struct lp_type type,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache,LLVMValueRef rgba_out[4])413 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
414                         const struct util_format_description *format_desc,
415                         struct lp_type type,
416                         boolean aligned,
417                         LLVMValueRef base_ptr,
418                         LLVMValueRef offset,
419                         LLVMValueRef i,
420                         LLVMValueRef j,
421                         LLVMValueRef cache,
422                         LLVMValueRef rgba_out[4])
423 {
424    LLVMBuilderRef builder = gallivm->builder;
425    enum pipe_format format = format_desc->format;
426    struct lp_type fetch_type;
427 
428    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
429        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
430         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
431         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
432        format_desc->block.width == 1 &&
433        format_desc->block.height == 1 &&
434        format_desc->block.bits <= type.width &&
435        (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
436         format_desc->channel[0].size == 32 ||
437         format_desc->channel[0].size == 16))
438    {
439       /*
440        * The packed pixel fits into an element of the destination format. Put
441        * the packed pixels into a vector and extract each component for all
442        * vector elements in parallel.
443        */
444 
445       LLVMValueRef packed;
446 
447       /*
448        * gather the texels from the texture
449        * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
450        */
451       assert(format_desc->block.bits <= type.width);
452       fetch_type = lp_type_uint(type.width);
453       packed = lp_build_gather(gallivm,
454                                type.length,
455                                format_desc->block.bits,
456                                fetch_type,
457                                aligned,
458                                base_ptr, offset, FALSE);
459 
460       /*
461        * convert texels to float rgba
462        */
463       lp_build_unpack_rgba_soa(gallivm,
464                                format_desc,
465                                type,
466                                packed, rgba_out);
467       return;
468    }
469 
470 
471    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
472        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
473        format_desc->block.width == 1 &&
474        format_desc->block.height == 1 &&
475        format_desc->block.bits > type.width &&
476        ((format_desc->block.bits <= type.width * type.length &&
477          format_desc->channel[0].size <= type.width) ||
478         (format_desc->channel[0].size == 64 &&
479          format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
480          type.floating)))
481    {
482       /*
483        * Similar to above, but the packed pixel is larger than what fits
484        * into an element of the destination format. The packed pixels will be
485        * shuffled into SoA vectors appropriately, and then the extraction will
486        * be done in parallel as much as possible.
487        * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
488        * the gathered vectors can be shuffled easily (even with avx).
489        * 64xn float -> 32xn float is handled too but it's a bit special as
490        * it does the conversion pre-shuffle.
491        */
492 
493       LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
494       struct lp_type fetch_type, gather_type = type;
495       unsigned num_gather, fetch_width, i, j;
496       struct lp_build_context bld;
497       boolean fp64 = format_desc->channel[0].size == 64;
498 
499       lp_build_context_init(&bld, gallivm, type);
500 
501       assert(type.width == 32);
502       assert(format_desc->block.bits > type.width);
503 
504       /*
505        * First, figure out fetch order.
506        */
507       fetch_width = util_next_power_of_two(format_desc->block.bits);
508       /*
509        * fp64 are treated like fp32 except we fetch twice wide values
510        * (as we shuffle after trunc). The shuffles for that work out
511        * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
512        * albeit we miss the potential opportunity for hw gather (as it
513        * only handles native size).
514        */
515       num_gather = fetch_width / type.width;
516       gather_type.width *= num_gather;
517       if (fp64) {
518          num_gather /= 2;
519       }
520       gather_type.length /= num_gather;
521 
522       for (i = 0; i < num_gather; i++) {
523          LLVMValueRef offsetr, shuf_vec;
524          if(num_gather == 4) {
525             for (j = 0; j < gather_type.length; j++) {
526                unsigned idx = i + 4*j;
527                shuffles[j] = lp_build_const_int32(gallivm, idx);
528             }
529             shuf_vec = LLVMConstVector(shuffles, gather_type.length);
530             offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
531 
532          }
533          else if (num_gather == 2) {
534             assert(num_gather == 2);
535             for (j = 0; j < gather_type.length; j++) {
536                unsigned idx = i*2 + (j%2) + (j/2)*4;
537                shuffles[j] = lp_build_const_int32(gallivm, idx);
538             }
539             shuf_vec = LLVMConstVector(shuffles, gather_type.length);
540             offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
541          }
542          else {
543             assert(num_gather == 1);
544             offsetr = offset;
545          }
546          if (gather_type.length == 1) {
547             LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
548             offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
549          }
550 
551          /*
552           * Determine whether to use float or int loads. This is mostly
553           * to outsmart the (stupid) llvm int/float shuffle logic, we
554           * don't really care much if the data is floats or ints...
555           * But llvm will refuse to use single float shuffle with int data
556           * and instead use 3 int shuffles instead, the code looks atrocious.
557           * (Note bitcasts often won't help, as llvm is too smart to be
558           * fooled by that.)
559           * Nobody cares about simd float<->int domain transition penalties,
560           * which usually don't even exist for shuffles anyway.
561           * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
562           * going into transpose, which is unpacks, so doesn't really matter
563           * much).
564           * With 2x32bit or 4x16bit fetch, we use float vec, since those
565           * go into the weird channel separation shuffle. With floats,
566           * this is (with 128bit vectors):
567           * - 2 movq, 2 movhpd, 2 shufps
568           * With ints it would be:
569           * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
570           * I've seen texture functions increase in code size by 15% just due
571           * to that (there's lots of such fetches in them...)
572           * (We could chose a different gather order to improve this somewhat
573           * for the int path, but it would basically just drop the blends,
574           * so the float path with this order really is optimal.)
575           * Albeit it is tricky sometimes llvm doesn't ignore the float->int
576           * casts so must avoid them until we're done with the float shuffle...
577           * 3x16bit formats (the same is also true for 3x8) are pretty bad but
578           * there's nothing we can do about them (we could overallocate by
579           * those couple bytes and use unaligned but pot sized load).
580           * Note that this is very much x86 specific. I don't know if this
581           * affect other archs at all.
582           */
583          if (num_gather > 1) {
584             /*
585              * We always want some float type here (with x86)
586              * due to shuffles being float ones afterwards (albeit for
587              * the num_gather == 4 case int should work fine too
588              * (unless there's some problems with avx but not avx2).
589              */
590             if (format_desc->channel[0].size == 64) {
591                fetch_type = lp_type_float_vec(64, gather_type.width);
592             } else {
593                fetch_type = lp_type_int_vec(32, gather_type.width);
594             }
595          }
596          else {
597             /* type doesn't matter much */
598             if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
599                 (format_desc->channel[0].size == 32 ||
600                  format_desc->channel[0].size == 64)) {
601             fetch_type = lp_type_float(gather_type.width);
602             } else {
603                fetch_type = lp_type_uint(gather_type.width);
604             }
605          }
606 
607          /* Now finally gather the values */
608          packed[i] = lp_build_gather(gallivm, gather_type.length,
609                                      format_desc->block.bits,
610                                      fetch_type, aligned,
611                                      base_ptr, offsetr, FALSE);
612          if (fp64) {
613             struct lp_type conv_type = type;
614             conv_type.width *= 2;
615             packed[i] = LLVMBuildBitCast(builder, packed[i],
616                                          lp_build_vec_type(gallivm, conv_type), "");
617             packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
618          }
619       }
620 
621       /* shuffle the gathered values to SoA */
622       if (num_gather == 2) {
623          for (i = 0; i < num_gather; i++) {
624             for (j = 0; j < type.length; j++) {
625                unsigned idx = (j%2)*2 + (j/4)*4 + i;
626                if ((j/2)%2)
627                   idx += type.length;
628                shuffles[j] = lp_build_const_int32(gallivm, idx);
629             }
630             dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
631                                             LLVMConstVector(shuffles, type.length), "");
632          }
633       }
634       else if (num_gather == 4) {
635          lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
636       }
637       else {
638          assert(num_gather == 1);
639          dst[0] = packed[0];
640       }
641 
642       /*
643        * And finally unpack exactly as above, except that
644        * chan shift is adjusted and the right vector selected.
645        */
646       if (!fp64) {
647          for (i = 0; i < num_gather; i++) {
648             dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
649          }
650          for (i = 0; i < format_desc->nr_channels; i++) {
651             struct util_format_channel_description chan_desc = format_desc->channel[i];
652             unsigned blockbits = type.width;
653             unsigned vec_nr;
654 
655 #ifdef PIPE_ARCH_BIG_ENDIAN
656             vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
657 #else
658             vec_nr = chan_desc.shift / type.width;
659 #endif
660             chan_desc.shift %= type.width;
661 
662             output[i] = lp_build_extract_soa_chan(&bld,
663                                                   blockbits,
664                                                   FALSE,
665                                                   chan_desc,
666                                                   dst[vec_nr]);
667          }
668       }
669       else {
670          for (i = 0; i < format_desc->nr_channels; i++)  {
671             output[i] = dst[i];
672          }
673       }
674 
675       lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
676       return;
677    }
678 
679    if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
680        format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
681       /*
682        * similar conceptually to above but requiring special
683        * AoS packed -> SoA float conversion code.
684        */
685       LLVMValueRef packed;
686       struct lp_type fetch_type = lp_type_uint(type.width);
687 
688       assert(type.floating);
689       assert(type.width == 32);
690 
691       packed = lp_build_gather(gallivm, type.length,
692                                format_desc->block.bits,
693                                fetch_type, aligned,
694                                base_ptr, offset, FALSE);
695       if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
696          lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
697       }
698       else {
699          lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
700       }
701       return;
702    }
703 
704    if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
705        format_desc->block.bits == 64) {
706       /*
707        * special case the format is 64 bits but we only require
708        * 32bit (or 8bit) from each block.
709        */
710       LLVMValueRef packed;
711       struct lp_type fetch_type = lp_type_uint(type.width);
712 
713       if (format == PIPE_FORMAT_X32_S8X24_UINT) {
714          /*
715           * for stencil simply fix up offsets - could in fact change
716           * base_ptr instead even outside the shader.
717           */
718          unsigned mask = (1 << 8) - 1;
719          LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
720          offset = LLVMBuildAdd(builder, offset, s_offset, "");
721          packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
722                                   aligned, base_ptr, offset, FALSE);
723          packed = LLVMBuildAnd(builder, packed,
724                                lp_build_const_int_vec(gallivm, type, mask), "");
725       }
726       else {
727          assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
728          packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
729                                   aligned, base_ptr, offset, TRUE);
730          packed = LLVMBuildBitCast(builder, packed,
731                                    lp_build_vec_type(gallivm, type), "");
732       }
733       /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
734       rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
735       rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
736       return;
737    }
738 
739    /*
740     * Try calling lp_build_fetch_rgba_aos for all pixels.
741     * Should only really hit subsampled, compressed
742     * (for s3tc srgb too, for rgtc the unorm ones only) by now.
743     * (This is invalid for plain 8unorm formats because we're lazy with
744     * the swizzle since some results would arrive swizzled, some not.)
745     */
746 
747    if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
748        (util_format_fits_8unorm(format_desc) ||
749         format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
750        type.floating && type.width == 32 &&
751        (type.length == 1 || (type.length % 4 == 0))) {
752       struct lp_type tmp_type;
753       struct lp_build_context bld;
754       LLVMValueRef packed, rgba[4];
755       const struct util_format_description *flinear_desc;
756       const struct util_format_description *frgba8_desc;
757       unsigned chan;
758 
759       lp_build_context_init(&bld, gallivm, type);
760 
761       /*
762        * Make sure the conversion in aos really only does convert to rgba8
763        * and not anything more (so use linear format, adjust type).
764        */
765       flinear_desc = util_format_description(util_format_linear(format));
766       memset(&tmp_type, 0, sizeof tmp_type);
767       tmp_type.width = 8;
768       tmp_type.length = type.length * 4;
769       tmp_type.norm = TRUE;
770 
771       packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
772                                        aligned, base_ptr, offset, i, j, cache);
773       packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
774 
775       /*
776        * The values are now packed so they match ordinary (srgb) RGBA8 format,
777        * hence need to use matching format for unpack.
778        */
779       frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
780       if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
781          assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
782          frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
783       }
784       lp_build_unpack_rgba_soa(gallivm,
785                                frgba8_desc,
786                                type,
787                                packed, rgba);
788 
789       /*
790        * We converted 4 channels. Make sure llvm can drop unneeded ones
791        * (luckily the rgba order is fixed, only LA needs special case).
792        */
793       for (chan = 0; chan < 4; chan++) {
794          enum pipe_swizzle swizzle = format_desc->swizzle[chan];
795          if (chan == 3 && util_format_is_luminance_alpha(format)) {
796             swizzle = PIPE_SWIZZLE_W;
797          }
798          rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
799       }
800       return;
801    }
802 
803 
804    /*
805     * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
806     *
807     * This is not the most efficient way of fetching pixels, as we
808     * miss some opportunities to do vectorization, but this is
809     * convenient for formats or scenarios for which there was no
810     * opportunity or incentive to optimize.
811     *
812     * We do NOT want to end up here, this typically is quite terrible,
813     * in particular if the formats have less than 4 channels.
814     *
815     * Right now, this should only be hit for:
816     * - RGTC snorm formats
817     *   (those miss fast fetch functions hence they are terrible anyway)
818     */
819 
820    {
821       unsigned k;
822       struct lp_type tmp_type;
823       LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
824 
825       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
826          debug_printf("%s: AoS fetch fallback for %s\n",
827                       __FUNCTION__, format_desc->short_name);
828       }
829 
830       tmp_type = type;
831       tmp_type.length = 4;
832 
833       /*
834        * Note that vector transpose can be worse compared to insert/extract
835        * for aos->soa conversion (for formats with 1 or 2 channels). However,
836        * we should try to avoid getting here for just about all formats, so
837        * don't bother.
838        */
839 
840       /* loop over number of pixels */
841       for(k = 0; k < type.length; ++k) {
842          LLVMValueRef index = lp_build_const_int32(gallivm, k);
843          LLVMValueRef offset_elem;
844          LLVMValueRef i_elem, j_elem;
845 
846          offset_elem = LLVMBuildExtractElement(builder, offset,
847                                                index, "");
848 
849          i_elem = LLVMBuildExtractElement(builder, i, index, "");
850          j_elem = LLVMBuildExtractElement(builder, j, index, "");
851 
852          /* Get a single float[4]={R,G,B,A} pixel */
853          aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
854                                                 aligned, base_ptr, offset_elem,
855                                                 i_elem, j_elem, cache);
856 
857       }
858       convert_to_soa(gallivm, aos_fetch, rgba_out, type);
859    }
860 }
861