1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * AoS pixel format manipulation.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  */
34 
35 
36 #include "util/u_format.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include "util/u_pointer.h"
40 #include "util/u_string.h"
41 #include "util/u_cpu_detect.h"
42 
43 #include "lp_bld_arit.h"
44 #include "lp_bld_init.h"
45 #include "lp_bld_type.h"
46 #include "lp_bld_flow.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_swizzle.h"
50 #include "lp_bld_gather.h"
51 #include "lp_bld_debug.h"
52 #include "lp_bld_format.h"
53 #include "lp_bld_pack.h"
54 #include "lp_bld_intr.h"
55 #include "lp_bld_logic.h"
56 #include "lp_bld_bitarit.h"
57 
58 
59 /**
60  * Basic swizzling.  Rearrange the order of the unswizzled array elements
61  * according to the format description.  PIPE_SWIZZLE_0/ONE are supported
62  * too.
63  * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
64  */
65 LLVMValueRef
lp_build_format_swizzle_aos(const struct util_format_description * desc,struct lp_build_context * bld,LLVMValueRef unswizzled)66 lp_build_format_swizzle_aos(const struct util_format_description *desc,
67                             struct lp_build_context *bld,
68                             LLVMValueRef unswizzled)
69 {
70    unsigned char swizzles[4];
71    unsigned chan;
72 
73    assert(bld->type.length % 4 == 0);
74 
75    for (chan = 0; chan < 4; ++chan) {
76       enum pipe_swizzle swizzle;
77 
78       if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
79          /*
80           * For ZS formats do RGBA = ZZZ1
81           */
82          if (chan == 3) {
83             swizzle = PIPE_SWIZZLE_1;
84          } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
85             swizzle = PIPE_SWIZZLE_0;
86          } else {
87             swizzle = desc->swizzle[0];
88          }
89       } else {
90          swizzle = desc->swizzle[chan];
91       }
92       swizzles[chan] = swizzle;
93    }
94 
95    return lp_build_swizzle_aos(bld, unswizzled, swizzles);
96 }
97 
98 
99 /**
100  * Whether the format matches the vector type, apart of swizzles.
101  */
102 static inline boolean
format_matches_type(const struct util_format_description * desc,struct lp_type type)103 format_matches_type(const struct util_format_description *desc,
104                     struct lp_type type)
105 {
106    enum util_format_type chan_type;
107    unsigned chan;
108 
109    assert(type.length % 4 == 0);
110 
111    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
112        desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
113        desc->block.width != 1 ||
114        desc->block.height != 1) {
115       return FALSE;
116    }
117 
118    if (type.floating) {
119       chan_type = UTIL_FORMAT_TYPE_FLOAT;
120    } else if (type.fixed) {
121       chan_type = UTIL_FORMAT_TYPE_FIXED;
122    } else if (type.sign) {
123       chan_type = UTIL_FORMAT_TYPE_SIGNED;
124    } else {
125       chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
126    }
127 
128    for (chan = 0; chan < desc->nr_channels; ++chan) {
129       if (desc->channel[chan].size != type.width) {
130          return FALSE;
131       }
132 
133       if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
134          if (desc->channel[chan].type != chan_type ||
135              desc->channel[chan].normalized != type.norm) {
136             return FALSE;
137          }
138       }
139    }
140 
141    return TRUE;
142 }
143 
144 /*
145  * Do rounding when converting small unorm values to larger ones.
146  * Not quite 100% accurate, as it's done by appending MSBs, but
147  * should be good enough.
148  */
149 
150 static inline LLVMValueRef
scale_bits_up(struct gallivm_state * gallivm,int src_bits,int dst_bits,LLVMValueRef src,struct lp_type src_type)151 scale_bits_up(struct gallivm_state *gallivm,
152               int src_bits,
153               int dst_bits,
154               LLVMValueRef src,
155               struct lp_type src_type)
156 {
157    LLVMBuilderRef builder = gallivm->builder;
158    LLVMValueRef result = src;
159 
160    if (src_bits == 1 && dst_bits > 1) {
161       /*
162        * Useful for a1 - we'd need quite some repeated copies otherwise.
163        */
164       struct lp_build_context bld;
165       LLVMValueRef dst_mask;
166       lp_build_context_init(&bld, gallivm, src_type);
167       dst_mask = lp_build_const_int_vec(gallivm, src_type,
168                                         (1 << dst_bits) - 1),
169       result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
170                             lp_build_const_int_vec(gallivm, src_type, 0));
171       result = lp_build_andnot(&bld, dst_mask, result);
172    }
173    else if (dst_bits > src_bits) {
174       /* Scale up bits */
175       int db = dst_bits - src_bits;
176 
177       /* Shift left by difference in bits */
178       result = LLVMBuildShl(builder,
179                             src,
180                             lp_build_const_int_vec(gallivm, src_type, db),
181                             "");
182 
183       if (db <= src_bits) {
184          /* Enough bits in src to fill the remainder */
185          LLVMValueRef lower = LLVMBuildLShr(builder,
186                                             src,
187                                             lp_build_const_int_vec(gallivm, src_type,
188                                                                    src_bits - db),
189                                             "");
190 
191          result = LLVMBuildOr(builder, result, lower, "");
192       } else if (db > src_bits) {
193          /* Need to repeatedly copy src bits to fill remainder in dst */
194          unsigned n;
195 
196          for (n = src_bits; n < dst_bits; n *= 2) {
197             LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
198 
199             result = LLVMBuildOr(builder,
200                                  result,
201                                  LLVMBuildLShr(builder, result, shuv, ""),
202                                  "");
203          }
204       }
205    } else {
206       assert (dst_bits == src_bits);
207    }
208 
209    return result;
210 }
211 
212 /**
213  * Unpack a single pixel into its XYZW components.
214  *
215  * @param desc  the pixel format for the packed pixel value
216  * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
217  *
218  * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
219  */
220 static inline LLVMValueRef
lp_build_unpack_arith_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * desc,LLVMValueRef packed)221 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
222                                const struct util_format_description *desc,
223                                LLVMValueRef packed)
224 {
225    LLVMBuilderRef builder = gallivm->builder;
226    LLVMValueRef shifted, casted, scaled, masked;
227    LLVMValueRef shifts[4];
228    LLVMValueRef masks[4];
229    LLVMValueRef scales[4];
230    LLVMTypeRef vec32_type;
231 
232    boolean normalized;
233    boolean needs_uitofp;
234    unsigned i;
235 
236    /* TODO: Support more formats */
237    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
238    assert(desc->block.width == 1);
239    assert(desc->block.height == 1);
240    assert(desc->block.bits <= 32);
241 
242    /* Do the intermediate integer computations with 32bit integers since it
243     * matches floating point size */
244    assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
245 
246    vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
247 
248    /* Broadcast the packed value to all four channels
249     * before: packed = BGRA
250     * after: packed = {BGRA, BGRA, BGRA, BGRA}
251     */
252    packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
253                                    LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
254                                    "");
255    packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
256                                    LLVMConstNull(vec32_type),
257                                    "");
258 
259    /* Initialize vector constants */
260    normalized = FALSE;
261    needs_uitofp = FALSE;
262 
263    /* Loop over 4 color components */
264    for (i = 0; i < 4; ++i) {
265       unsigned bits = desc->channel[i].size;
266       unsigned shift = desc->channel[i].shift;
267 
268       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
269          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
270          masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
271          scales[i] =  LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
272       }
273       else {
274          unsigned long long mask = (1ULL << bits) - 1;
275 
276          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
277 
278          if (bits == 32) {
279             needs_uitofp = TRUE;
280          }
281 
282          shifts[i] = lp_build_const_int32(gallivm, shift);
283          masks[i] = lp_build_const_int32(gallivm, mask);
284 
285          if (desc->channel[i].normalized) {
286             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
287             normalized = TRUE;
288          }
289          else
290             scales[i] =  lp_build_const_float(gallivm, 1.0);
291       }
292    }
293 
294    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
295     * into masked = {X, Y, Z, W}
296     */
297    if (desc->block.bits < 32 && normalized) {
298       /*
299        * Note: we cannot do the shift below on x86 natively until AVX2.
300        *
301        * Old llvm versions will resort to scalar extract/shift insert,
302        * which is definitely terrible, new versions will just do
303        * several vector shifts and shuffle/blend results together.
304        * We could turn this into a variable left shift plus a constant
305        * right shift, and llvm would then turn the variable left shift
306        * into a mul for us (albeit without sse41 the mul needs emulation
307        * too...). However, since we're going to do a float mul
308        * anyway, we just adjust that mul instead (plus the mask), skipping
309        * the shift completely.
310        * We could also use a extra mul when the format isn't normalized and
311        * we don't have AVX2 support, but don't bother for now. Unfortunately,
312        * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
313        * rgba8 if it ends up here), as that would require UIToFP, albeit that
314        * would be fixable with easy 16bit shuffle (unless there's channels
315        * crossing 16bit boundaries).
316        */
317       for (i = 0; i < 4; ++i) {
318          if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
319             unsigned bits = desc->channel[i].size;
320             unsigned shift = desc->channel[i].shift;
321             unsigned long long mask = ((1ULL << bits) - 1) << shift;
322             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
323             masks[i] = lp_build_const_int32(gallivm, mask);
324          }
325       }
326       masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
327    } else {
328       shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
329       masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
330    }
331 
332    if (!needs_uitofp) {
333       /* UIToFP can't be expressed in SSE2 */
334       casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
335    } else {
336       casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
337    }
338 
339    /*
340     * At this point 'casted' may be a vector of floats such as
341     * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
342     * by powers of two). Next, if the pixel values are normalized
343     * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
344     */
345 
346    if (normalized)
347       scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
348    else
349       scaled = casted;
350 
351    return scaled;
352 }
353 
354 
355 /**
356  * Pack a single pixel.
357  *
358  * @param rgba 4 float vector with the unpacked components.
359  *
360  * XXX: This is mostly for reference and testing -- operating a single pixel at
361  * a time is rarely if ever needed.
362  */
363 LLVMValueRef
lp_build_pack_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * desc,LLVMValueRef rgba)364 lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
365                        const struct util_format_description *desc,
366                        LLVMValueRef rgba)
367 {
368    LLVMBuilderRef builder = gallivm->builder;
369    LLVMTypeRef type;
370    LLVMValueRef packed = NULL;
371    LLVMValueRef swizzles[4];
372    LLVMValueRef shifted, casted, scaled, unswizzled;
373    LLVMValueRef shifts[4];
374    LLVMValueRef scales[4];
375    boolean normalized;
376    unsigned i, j;
377 
378    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
379    assert(desc->block.width == 1);
380    assert(desc->block.height == 1);
381 
382    type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);
383 
384    /* Unswizzle the color components into the source vector. */
385    for (i = 0; i < 4; ++i) {
386       for (j = 0; j < 4; ++j) {
387          if (desc->swizzle[j] == i)
388             break;
389       }
390       if (j < 4)
391          swizzles[i] = lp_build_const_int32(gallivm, j);
392       else
393          swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
394    }
395 
396    unswizzled = LLVMBuildShuffleVector(builder, rgba,
397                                        LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
398                                        LLVMConstVector(swizzles, 4), "");
399 
400    normalized = FALSE;
401    for (i = 0; i < 4; ++i) {
402       unsigned bits = desc->channel[i].size;
403       unsigned shift = desc->channel[i].shift;
404 
405       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
406          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
407          scales[i] =  LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
408       }
409       else {
410          unsigned mask = (1 << bits) - 1;
411 
412          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
413          assert(bits < 32);
414 
415          shifts[i] = lp_build_const_int32(gallivm, shift);
416 
417          if (desc->channel[i].normalized) {
418             scales[i] = lp_build_const_float(gallivm, mask);
419             normalized = TRUE;
420          }
421          else
422             scales[i] = lp_build_const_float(gallivm, 1.0);
423       }
424    }
425 
426    if (normalized)
427       scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
428    else
429       scaled = unswizzled;
430 
431    casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");
432 
433    shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
434 
435    /* Bitwise or all components */
436    for (i = 0; i < 4; ++i) {
437       if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
438          LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
439                                                lp_build_const_int32(gallivm, i), "");
440          if (packed)
441             packed = LLVMBuildOr(builder, packed, component, "");
442          else
443             packed = component;
444       }
445    }
446 
447    if (!packed)
448       packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
449 
450    if (desc->block.bits < 32)
451       packed = LLVMBuildTrunc(builder, packed, type, "");
452 
453    return packed;
454 }
455 
456 
457 
458 
459 /**
460  * Fetch a pixel into a 4 float AoS.
461  *
462  * \param format_desc  describes format of the image we're fetching from
463  * \param aligned  whether the data is guaranteed to be aligned
464  * \param ptr  address of the pixel block (or the texel if uncompressed)
465  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
466  *              these will always be (0, 0).
467  * \return  a 4 element vector with the pixel's RGBA values.
468  */
469 LLVMValueRef
lp_build_fetch_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * format_desc,struct lp_type type,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)470 lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
471                         const struct util_format_description *format_desc,
472                         struct lp_type type,
473                         boolean aligned,
474                         LLVMValueRef base_ptr,
475                         LLVMValueRef offset,
476                         LLVMValueRef i,
477                         LLVMValueRef j,
478                         LLVMValueRef cache)
479 {
480    LLVMBuilderRef builder = gallivm->builder;
481    unsigned num_pixels = type.length / 4;
482    struct lp_build_context bld;
483 
484    assert(type.length <= LP_MAX_VECTOR_LENGTH);
485    assert(type.length % 4 == 0);
486 
487    lp_build_context_init(&bld, gallivm, type);
488 
489    /*
490     * Trivial case
491     *
492     * The format matches the type (apart of a swizzle) so no need for
493     * scaling or converting.
494     */
495 
496    if (format_matches_type(format_desc, type) &&
497        format_desc->block.bits <= type.width * 4 &&
498        /* XXX this shouldn't be needed */
499        util_is_power_of_two(format_desc->block.bits)) {
500       LLVMValueRef packed;
501       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
502       struct lp_type fetch_type;
503       unsigned vec_len = type.width * type.length;
504 
505       /*
506        * The format matches the type (apart of a swizzle) so no need for
507        * scaling or converting.
508        */
509 
510       fetch_type = lp_type_uint(type.width*4);
511       packed = lp_build_gather(gallivm, type.length/4,
512                                format_desc->block.bits, fetch_type,
513                                aligned, base_ptr, offset, TRUE);
514 
515       assert(format_desc->block.bits <= vec_len);
516       (void) vec_len; /* silence unused var warning for non-debug build */
517 
518       packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
519       return lp_build_format_swizzle_aos(format_desc, &bld, packed);
520    }
521 
522    /*
523     * Bit arithmetic for converting small_unorm to unorm8.
524     *
525     * This misses some opportunities for optimizations (like skipping mask
526     * for the highest channel for instance, or doing bit scaling in parallel
527     * for channels with the same bit width) but it should be passable for
528     * all arithmetic formats.
529     */
530    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
531        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
532        util_format_fits_8unorm(format_desc) &&
533        type.width == 8 && type.norm == 1 && type.sign == 0 &&
534        type.fixed == 0 && type.floating == 0) {
535       LLVMValueRef packed, res = NULL, chans[4], rgba[4];
536       LLVMTypeRef dst_vec_type, conv_vec_type;
537       struct lp_type fetch_type, conv_type;
538       struct lp_build_context bld_conv;
539       unsigned j;
540 
541       fetch_type = lp_type_uint(type.width*4);
542       conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
543       dst_vec_type = lp_build_vec_type(gallivm, type);
544       conv_vec_type = lp_build_vec_type(gallivm, conv_type);
545       lp_build_context_init(&bld_conv, gallivm, conv_type);
546 
547       packed = lp_build_gather(gallivm, type.length/4,
548                                format_desc->block.bits, fetch_type,
549                                aligned, base_ptr, offset, TRUE);
550 
551       assert(format_desc->block.bits * type.length / 4 <=
552              type.width * type.length);
553 
554       packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
555 
556       for (j = 0; j < format_desc->nr_channels; ++j) {
557          unsigned mask = 0;
558          unsigned sa = format_desc->channel[j].shift;
559 
560          mask = (1 << format_desc->channel[j].size) - 1;
561 
562          /* Extract bits from source */
563          chans[j] = LLVMBuildLShr(builder, packed,
564                                   lp_build_const_int_vec(gallivm, conv_type, sa),
565                                   "");
566 
567          chans[j] = LLVMBuildAnd(builder, chans[j],
568                                  lp_build_const_int_vec(gallivm, conv_type, mask),
569                                  "");
570 
571          /* Scale bits */
572          if (type.norm) {
573             chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
574                                      type.width, chans[j], conv_type);
575          }
576       }
577       /*
578        * This is a hacked lp_build_format_swizzle_soa() since we need a
579        * normalized 1 but only 8 bits in a 32bit vector...
580        */
581       for (j = 0; j < 4; ++j) {
582          enum pipe_swizzle swizzle = format_desc->swizzle[j];
583          if (swizzle == PIPE_SWIZZLE_1) {
584             rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
585          } else {
586             rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
587          }
588          if (j == 0) {
589             res = rgba[j];
590          } else {
591             rgba[j] = LLVMBuildShl(builder, rgba[j],
592                                    lp_build_const_int_vec(gallivm, conv_type,
593                                                           j * type.width), "");
594             res = LLVMBuildOr(builder, res, rgba[j], "");
595          }
596       }
597       res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
598 
599       return res;
600    }
601 
602    /*
603     * Bit arithmetic
604     */
605 
606    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
607        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
608         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
609        format_desc->block.width == 1 &&
610        format_desc->block.height == 1 &&
611        /* XXX this shouldn't be needed */
612        util_is_power_of_two(format_desc->block.bits) &&
613        format_desc->block.bits <= 32 &&
614        format_desc->is_bitmask &&
615        !format_desc->is_mixed &&
616        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
617         format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
618        !format_desc->channel[0].pure_integer) {
619 
620       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
621       LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
622       struct lp_type conv_type;
623       unsigned k, num_conv_src, num_conv_dst;
624 
625       /*
626        * Note this path is generally terrible for fetching multiple pixels.
627        * We should make sure we cannot hit this code path for anything but
628        * single pixels.
629        */
630 
631       /*
632        * Unpack a pixel at a time into a <4 x float> RGBA vector
633        */
634 
635       for (k = 0; k < num_pixels; ++k) {
636          LLVMValueRef packed;
637 
638          packed = lp_build_gather_elem(gallivm, num_pixels,
639                                        format_desc->block.bits, 32, aligned,
640                                        base_ptr, offset, k, FALSE);
641 
642          tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
643                                                   format_desc,
644                                                   packed);
645       }
646 
647       /*
648        * Type conversion.
649        *
650        * TODO: We could avoid floating conversion for integer to
651        * integer conversions.
652        */
653 
654       if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
655          debug_printf("%s: unpacking %s with floating point\n",
656                       __FUNCTION__, format_desc->short_name);
657       }
658 
659       conv_type = lp_float32_vec4_type();
660       num_conv_src = num_pixels;
661       num_conv_dst = 1;
662 
663       if (num_pixels % 8 == 0) {
664          lp_build_concat_n(gallivm, lp_float32_vec4_type(),
665                            tmps, num_pixels, tmps, num_pixels / 2);
666          conv_type.length *= num_pixels / 4;
667          num_conv_src = 4 * num_pixels / 8;
668          if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
669             /*
670              * FIXME: The fast float->unorm path (which is basically
671              * skipping the MIN/MAX which are extremely pointless in any
672              * case) requires that there's 2 destinations...
673              * In any case, we really should make sure we don't hit this
674              * code with multiple pixels for unorm8 dst types, it's
675              * completely hopeless even if we do hit the right conversion.
676              */
677             type.length /= num_pixels / 4;
678             num_conv_dst = num_pixels / 4;
679          }
680       }
681 
682       lp_build_conv(gallivm, conv_type, type,
683                     tmps, num_conv_src, res, num_conv_dst);
684 
685       if (num_pixels % 8 == 0 &&
686           (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
687          lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
688       }
689 
690       return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
691    }
692 
693    /* If all channels are of same type and we are not using half-floats */
694    if (format_desc->is_array &&
695        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
696       assert(!format_desc->is_mixed);
697       return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
698    }
699 
700    /*
701     * YUV / subsampled formats
702     */
703 
704    if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
705       struct lp_type tmp_type;
706       LLVMValueRef tmp;
707 
708       memset(&tmp_type, 0, sizeof tmp_type);
709       tmp_type.width = 8;
710       tmp_type.length = num_pixels * 4;
711       tmp_type.norm = TRUE;
712 
713       tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
714                                                format_desc,
715                                                num_pixels,
716                                                base_ptr,
717                                                offset,
718                                                i, j);
719 
720       lp_build_conv(gallivm,
721                     tmp_type, type,
722                     &tmp, 1, &tmp, 1);
723 
724       return tmp;
725    }
726 
727    /*
728     * s3tc rgb formats
729     */
730 
731    if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
732       struct lp_type tmp_type;
733       LLVMValueRef tmp;
734 
735       memset(&tmp_type, 0, sizeof tmp_type);
736       tmp_type.width = 8;
737       tmp_type.length = num_pixels * 4;
738       tmp_type.norm = TRUE;
739 
740       tmp = lp_build_fetch_cached_texels(gallivm,
741                                          format_desc,
742                                          num_pixels,
743                                          base_ptr,
744                                          offset,
745                                          i, j,
746                                          cache);
747 
748       lp_build_conv(gallivm,
749                     tmp_type, type,
750                     &tmp, 1, &tmp, 1);
751 
752        return tmp;
753    }
754 
755    /*
756     * Fallback to util_format_description::fetch_rgba_8unorm().
757     */
758 
759    if (format_desc->fetch_rgba_8unorm &&
760        !type.floating && type.width == 8 && !type.sign && type.norm) {
761       /*
762        * Fallback to calling util_format_description::fetch_rgba_8unorm.
763        *
764        * This is definitely not the most efficient way of fetching pixels, as
765        * we miss the opportunity to do vectorization, but this it is a
766        * convenient for formats or scenarios for which there was no opportunity
767        * or incentive to optimize.
768        */
769 
770       LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
771       LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
772       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
773       LLVMValueRef function;
774       LLVMValueRef tmp_ptr;
775       LLVMValueRef tmp;
776       LLVMValueRef res;
777       unsigned k;
778 
779       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
780          debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
781                       __FUNCTION__, format_desc->short_name);
782       }
783 
784       /*
785        * Declare and bind format_desc->fetch_rgba_8unorm().
786        */
787 
788       {
789          /*
790           * Function to call looks like:
791           *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
792           */
793          LLVMTypeRef ret_type;
794          LLVMTypeRef arg_types[4];
795          LLVMTypeRef function_type;
796 
797          ret_type = LLVMVoidTypeInContext(gallivm->context);
798          arg_types[0] = pi8t;
799          arg_types[1] = pi8t;
800          arg_types[2] = i32t;
801          arg_types[3] = i32t;
802          function_type = LLVMFunctionType(ret_type, arg_types,
803                                           ARRAY_SIZE(arg_types), 0);
804 
805          /* make const pointer for the C fetch_rgba_8unorm function */
806          function = lp_build_const_int_pointer(gallivm,
807             func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
808 
809          /* cast the callee pointer to the function's type */
810          function = LLVMBuildBitCast(builder, function,
811                                      LLVMPointerType(function_type, 0),
812                                      "cast callee");
813       }
814 
815       tmp_ptr = lp_build_alloca(gallivm, i32t, "");
816 
817       res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
818 
819       /*
820        * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result
821        * in the AoS result vector (one 32-bit RGBA pixel per element).
822        */
823 
824       for (k = 0; k < num_pixels; ++k) {
825          LLVMValueRef index = lp_build_const_int32(gallivm, k);
826          LLVMValueRef args[4];
827 
828          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
829          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
830                                             base_ptr, offset, k);
831 
832          if (num_pixels == 1) {
833             args[2] = i;
834             args[3] = j;
835          }
836          else {
837             args[2] = LLVMBuildExtractElement(builder, i, index, "");
838             args[3] = LLVMBuildExtractElement(builder, j, index, "");
839          }
840 
841          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
842 
843          tmp = LLVMBuildLoad(builder, tmp_ptr, "");
844 
845          if (num_pixels == 1) {
846             res = tmp;
847          }
848          else {
849             res = LLVMBuildInsertElement(builder, res, tmp, index, "");
850          }
851       }
852 
853       /* Bitcast from <n x i32> to <4n x i8> */
854       res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
855 
856       return res;
857    }
858 
859    /*
860     * Fallback to util_format_description::fetch_rgba_float().
861     */
862 
863    if (format_desc->fetch_rgba_float) {
864       /*
865        * Fallback to calling util_format_description::fetch_rgba_float.
866        *
867        * This is definitely not the most efficient way of fetching pixels, as
868        * we miss the opportunity to do vectorization, but it is
869        * convenient for formats or scenarios for which there was no opportunity
870        * or incentive to optimize.
871        */
872 
873       LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
874       LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
875       LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
876       LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
877       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
878       LLVMValueRef function;
879       LLVMValueRef tmp_ptr;
880       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
881       LLVMValueRef res;
882       unsigned k;
883 
884       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
885          debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
886                       __FUNCTION__, format_desc->short_name);
887       }
888 
889       /*
890        * Declare and bind format_desc->fetch_rgba_float().
891        */
892 
893       {
894          /*
895           * Function to call looks like:
896           *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
897           */
898          LLVMTypeRef ret_type;
899          LLVMTypeRef arg_types[4];
900 
901          ret_type = LLVMVoidTypeInContext(gallivm->context);
902          arg_types[0] = pf32t;
903          arg_types[1] = pi8t;
904          arg_types[2] = i32t;
905          arg_types[3] = i32t;
906 
907          function = lp_build_const_func_pointer(gallivm,
908                                                 func_to_pointer((func_pointer) format_desc->fetch_rgba_float),
909                                                 ret_type,
910                                                 arg_types, ARRAY_SIZE(arg_types),
911                                                 format_desc->short_name);
912       }
913 
914       tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
915 
916       /*
917        * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
918        * in tmps[] (one float vec4 per pixel).
919        */
920 
921       for (k = 0; k < num_pixels; ++k) {
922          LLVMValueRef args[4];
923 
924          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
925          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
926                                             base_ptr, offset, k);
927 
928          if (num_pixels == 1) {
929             args[2] = i;
930             args[3] = j;
931          }
932          else {
933             LLVMValueRef index = lp_build_const_int32(gallivm, k);
934             args[2] = LLVMBuildExtractElement(builder, i, index, "");
935             args[3] = LLVMBuildExtractElement(builder, j, index, "");
936          }
937 
938          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
939 
940          tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
941       }
942 
943       lp_build_conv(gallivm,
944                     lp_float32_vec4_type(),
945                     type,
946                     tmps, num_pixels, &res, 1);
947 
948       return res;
949    }
950 
951    assert(!util_format_is_pure_integer(format_desc->format));
952 
953    assert(0);
954    return lp_build_undef(gallivm, type);
955 }
956