1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * Texture sampling -- SoA.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  * @author Brian Paul <brianp@vmware.com>
34  */
35 
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/u_debug.h"
40 #include "util/u_dump.h"
41 #include "util/u_memory.h"
42 #include "util/u_math.h"
43 #include "util/u_format.h"
44 #include "util/u_cpu_detect.h"
45 #include "util/format_rgb9e5.h"
46 #include "lp_bld_debug.h"
47 #include "lp_bld_type.h"
48 #include "lp_bld_const.h"
49 #include "lp_bld_conv.h"
50 #include "lp_bld_arit.h"
51 #include "lp_bld_bitarit.h"
52 #include "lp_bld_logic.h"
53 #include "lp_bld_printf.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_flow.h"
56 #include "lp_bld_gather.h"
57 #include "lp_bld_format.h"
58 #include "lp_bld_sample.h"
59 #include "lp_bld_sample_aos.h"
60 #include "lp_bld_struct.h"
61 #include "lp_bld_quad.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_intr.h"
64 
65 
66 /**
67  * Generate code to fetch a texel from a texture at int coords (x, y, z).
68  * The computation depends on whether the texture is 1D, 2D or 3D.
69  * The result, texel, will be float vectors:
70  *   texel[0] = red values
71  *   texel[1] = green values
72  *   texel[2] = blue values
73  *   texel[3] = alpha values
74  */
75 static void
lp_build_sample_texel_soa(struct lp_build_sample_context * bld,LLVMValueRef width,LLVMValueRef height,LLVMValueRef depth,LLVMValueRef x,LLVMValueRef y,LLVMValueRef z,LLVMValueRef y_stride,LLVMValueRef z_stride,LLVMValueRef data_ptr,LLVMValueRef mipoffsets,LLVMValueRef texel_out[4])76 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
77                           LLVMValueRef width,
78                           LLVMValueRef height,
79                           LLVMValueRef depth,
80                           LLVMValueRef x,
81                           LLVMValueRef y,
82                           LLVMValueRef z,
83                           LLVMValueRef y_stride,
84                           LLVMValueRef z_stride,
85                           LLVMValueRef data_ptr,
86                           LLVMValueRef mipoffsets,
87                           LLVMValueRef texel_out[4])
88 {
89    const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
90    const unsigned dims = bld->dims;
91    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
92    LLVMBuilderRef builder = bld->gallivm->builder;
93    LLVMValueRef offset;
94    LLVMValueRef i, j;
95    LLVMValueRef use_border = NULL;
96 
97    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
98    if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
99                                               static_state->min_img_filter,
100                                               static_state->mag_img_filter)) {
101       LLVMValueRef b1, b2;
102       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
103       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
104       use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
105    }
106 
107    if (dims >= 2 &&
108        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
109                                               static_state->min_img_filter,
110                                               static_state->mag_img_filter)) {
111       LLVMValueRef b1, b2;
112       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
113       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
114       if (use_border) {
115          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
116          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
117       }
118       else {
119          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
120       }
121    }
122 
123    if (dims == 3 &&
124        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
125                                               static_state->min_img_filter,
126                                               static_state->mag_img_filter)) {
127       LLVMValueRef b1, b2;
128       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
129       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
130       if (use_border) {
131          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
132          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
133       }
134       else {
135          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
136       }
137    }
138 
139    /* convert x,y,z coords to linear offset from start of texture, in bytes */
140    lp_build_sample_offset(&bld->int_coord_bld,
141                           bld->format_desc,
142                           x, y, z, y_stride, z_stride,
143                           &offset, &i, &j);
144    if (mipoffsets) {
145       offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
146    }
147 
148    if (use_border) {
149       /* If we can sample the border color, it means that texcoords may
150        * lie outside the bounds of the texture image.  We need to do
151        * something to prevent reading out of bounds and causing a segfault.
152        *
153        * Simply AND the texture coords with !use_border.  This will cause
154        * coords which are out of bounds to become zero.  Zero's guaranteed
155        * to be inside the texture image.
156        */
157       offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
158    }
159 
160    lp_build_fetch_rgba_soa(bld->gallivm,
161                            bld->format_desc,
162                            bld->texel_type, TRUE,
163                            data_ptr, offset,
164                            i, j,
165                            bld->cache,
166                            texel_out);
167 
168    /*
169     * Note: if we find an app which frequently samples the texture border
170     * we might want to implement a true conditional here to avoid sampling
171     * the texture whenever possible (since that's quite a bit of code).
172     * Ex:
173     *   if (use_border) {
174     *      texel = border_color;
175     *   }
176     *   else {
177     *      texel = sample_texture(coord);
178     *   }
179     * As it is now, we always sample the texture, then selectively replace
180     * the texel color results with the border color.
181     */
182 
183    if (use_border) {
184       /* select texel color or border color depending on use_border. */
185       const struct util_format_description *format_desc = bld->format_desc;
186       int chan;
187       struct lp_type border_type = bld->texel_type;
188       border_type.length = 4;
189       /*
190        * Only replace channels which are actually present. The others should
191        * get optimized away eventually by sampler_view swizzle anyway but it's
192        * easier too.
193        */
194       for (chan = 0; chan < 4; chan++) {
195          unsigned chan_s;
196          /* reverse-map channel... */
197          for (chan_s = 0; chan_s < 4; chan_s++) {
198             if (chan_s == format_desc->swizzle[chan]) {
199                break;
200             }
201          }
202          if (chan_s <= 3) {
203             /* use the already clamped color */
204             LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
205             LLVMValueRef border_chan;
206 
207             border_chan = lp_build_extract_broadcast(bld->gallivm,
208                                                      border_type,
209                                                      bld->texel_type,
210                                                      bld->border_color_clamped,
211                                                      idx);
212             texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
213                                               border_chan, texel_out[chan]);
214          }
215       }
216    }
217 }
218 
219 
220 /**
221  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
222  * (Note that with pot sizes could do this much more easily post-scale
223  * with some bit arithmetic.)
224  */
225 static LLVMValueRef
lp_build_coord_mirror(struct lp_build_sample_context * bld,LLVMValueRef coord,boolean posOnly)226 lp_build_coord_mirror(struct lp_build_sample_context *bld,
227                       LLVMValueRef coord, boolean posOnly)
228 {
229    struct lp_build_context *coord_bld = &bld->coord_bld;
230    LLVMValueRef fract;
231    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
232 
233    /*
234     * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
235     * it all works out. (The result is in range [-1, 1.0], negative if
236     * the coord is in the "odd" section, otherwise positive.)
237     */
238 
239    coord = lp_build_mul(coord_bld, coord, half);
240    fract = lp_build_round(coord_bld, coord);
241    fract = lp_build_sub(coord_bld, coord, fract);
242    coord = lp_build_add(coord_bld, fract, fract);
243 
244    if (posOnly) {
245       /*
246        * Theoretically it's not quite 100% accurate because the spec says
247        * that ultimately a scaled coord of -x.0 should map to int coord
248        * -x + 1 with mirroring, not -x (this does not matter for bilinear
249        * filtering).
250        */
251       coord = lp_build_abs(coord_bld, coord);
252       /* kill off NaNs */
253       /* XXX: not safe without arch rounding, fract can be anything. */
254       coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
255                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
256    }
257 
258    return coord;
259 }
260 
261 
262 /**
263  * Helper to compute the first coord and the weight for
264  * linear wrap repeat npot textures
265  */
266 void
lp_build_coord_repeat_npot_linear(struct lp_build_sample_context * bld,LLVMValueRef coord_f,LLVMValueRef length_i,LLVMValueRef length_f,LLVMValueRef * coord0_i,LLVMValueRef * weight_f)267 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
268                                   LLVMValueRef coord_f,
269                                   LLVMValueRef length_i,
270                                   LLVMValueRef length_f,
271                                   LLVMValueRef *coord0_i,
272                                   LLVMValueRef *weight_f)
273 {
274    struct lp_build_context *coord_bld = &bld->coord_bld;
275    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
276    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
277    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
278                                                 int_coord_bld->one);
279    LLVMValueRef mask;
280    /* wrap with normalized floats is just fract */
281    coord_f = lp_build_fract(coord_bld, coord_f);
282    /* mul by size and subtract 0.5 */
283    coord_f = lp_build_mul(coord_bld, coord_f, length_f);
284    coord_f = lp_build_sub(coord_bld, coord_f, half);
285    /*
286     * we avoided the 0.5/length division before the repeat wrap,
287     * now need to fix up edge cases with selects
288     */
289    /*
290     * Note we do a float (unordered) compare so we can eliminate NaNs.
291     * (Otherwise would need fract_safe above).
292     */
293    mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
294                            PIPE_FUNC_LESS, coord_f, coord_bld->zero);
295 
296    /* convert to int, compute lerp weight */
297    lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
298    *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
299 }
300 
301 
302 /**
303  * Build LLVM code for texture wrap mode for linear filtering.
304  * \param x0_out  returns first integer texcoord
305  * \param x1_out  returns second integer texcoord
306  * \param weight_out  returns linear interpolation weight
307  */
308 static void
lp_build_sample_wrap_linear(struct lp_build_sample_context * bld,boolean is_gather,LLVMValueRef coord,LLVMValueRef length,LLVMValueRef length_f,LLVMValueRef offset,boolean is_pot,unsigned wrap_mode,LLVMValueRef * x0_out,LLVMValueRef * x1_out,LLVMValueRef * weight_out)309 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
310                             boolean is_gather,
311                             LLVMValueRef coord,
312                             LLVMValueRef length,
313                             LLVMValueRef length_f,
314                             LLVMValueRef offset,
315                             boolean is_pot,
316                             unsigned wrap_mode,
317                             LLVMValueRef *x0_out,
318                             LLVMValueRef *x1_out,
319                             LLVMValueRef *weight_out)
320 {
321    struct lp_build_context *coord_bld = &bld->coord_bld;
322    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
323    LLVMBuilderRef builder = bld->gallivm->builder;
324    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
325    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
326    LLVMValueRef coord0, coord1, weight;
327 
328    switch(wrap_mode) {
329    case PIPE_TEX_WRAP_REPEAT:
330       if (is_pot) {
331          /* mul by size and subtract 0.5 */
332          coord = lp_build_mul(coord_bld, coord, length_f);
333          coord = lp_build_sub(coord_bld, coord, half);
334          if (offset) {
335             offset = lp_build_int_to_float(coord_bld, offset);
336             coord = lp_build_add(coord_bld, coord, offset);
337          }
338          /* convert to int, compute lerp weight */
339          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
340          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
341          /* repeat wrap */
342          coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
343          coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
344       }
345       else {
346          LLVMValueRef mask;
347          if (offset) {
348             offset = lp_build_int_to_float(coord_bld, offset);
349             offset = lp_build_div(coord_bld, offset, length_f);
350             coord = lp_build_add(coord_bld, coord, offset);
351          }
352          lp_build_coord_repeat_npot_linear(bld, coord,
353                                            length, length_f,
354                                            &coord0, &weight);
355          mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
356                                  PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
357          coord1 = LLVMBuildAnd(builder,
358                                lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
359                                mask, "");
360       }
361       break;
362 
363    case PIPE_TEX_WRAP_CLAMP:
364       if (bld->static_sampler_state->normalized_coords) {
365          /* scale coord to length */
366          coord = lp_build_mul(coord_bld, coord, length_f);
367       }
368       if (offset) {
369          offset = lp_build_int_to_float(coord_bld, offset);
370          coord = lp_build_add(coord_bld, coord, offset);
371       }
372 
373       /*
374        * clamp to [0, length]
375        *
376        * Unlike some other wrap modes, this should be correct for gather
377        * too. GL_CLAMP explicitly does this clamp on the coord prior to
378        * actual wrapping (which is per sample).
379        */
380       coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
381 
382       coord = lp_build_sub(coord_bld, coord, half);
383 
384       /* convert to int, compute lerp weight */
385       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
386       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
387       break;
388 
389    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
390       {
391          struct lp_build_context abs_coord_bld = bld->coord_bld;
392          abs_coord_bld.type.sign = FALSE;
393 
394          if (bld->static_sampler_state->normalized_coords) {
395             /* mul by tex size */
396             coord = lp_build_mul(coord_bld, coord, length_f);
397          }
398          if (offset) {
399             offset = lp_build_int_to_float(coord_bld, offset);
400             coord = lp_build_add(coord_bld, coord, offset);
401          }
402 
403          /* clamp to length max */
404          coord = lp_build_min_ext(coord_bld, coord, length_f,
405                                   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
406          if (!is_gather) {
407             /* subtract 0.5 */
408             coord = lp_build_sub(coord_bld, coord, half);
409             /* clamp to [0, length - 0.5] */
410             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
411             /* convert to int, compute lerp weight */
412             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
413             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
414          } else {
415             /*
416              * The non-gather path will end up with coords 0, 1 if coord was
417              * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
418              * really matter what the second coord is). But for gather, we
419              * really need to end up with coords 0, 0.
420              */
421             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
422             coord0 = lp_build_sub(coord_bld, coord, half);
423             coord1 = lp_build_add(coord_bld, coord, half);
424             /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5] */
425             coord0 = lp_build_itrunc(coord_bld, coord0);
426             coord1 = lp_build_itrunc(coord_bld, coord1);
427             weight = coord_bld->undef;
428          }
429          /* coord1 = min(coord1, length-1) */
430          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
431          break;
432       }
433 
434    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435       if (bld->static_sampler_state->normalized_coords) {
436          /* scale coord to length */
437          coord = lp_build_mul(coord_bld, coord, length_f);
438       }
439       if (offset) {
440          offset = lp_build_int_to_float(coord_bld, offset);
441          coord = lp_build_add(coord_bld, coord, offset);
442       }
443       /*
444        * We don't need any clamp. Technically, for very large (pos or neg)
445        * (or infinite) values, clamp against [-length, length] would be
446        * correct, but we don't need to guarantee any specific
447        * result for such coords (the ifloor will be undefined, but for modes
448        * requiring border all resulting coords are safe).
449        */
450       coord = lp_build_sub(coord_bld, coord, half);
451       /* convert to int, compute lerp weight */
452       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
453       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
454       break;
455 
456    case PIPE_TEX_WRAP_MIRROR_REPEAT:
457       if (offset) {
458          offset = lp_build_int_to_float(coord_bld, offset);
459          offset = lp_build_div(coord_bld, offset, length_f);
460          coord = lp_build_add(coord_bld, coord, offset);
461       }
462       if (!is_gather) {
463          /* compute mirror function */
464          coord = lp_build_coord_mirror(bld, coord, TRUE);
465 
466          /* scale coord to length */
467          coord = lp_build_mul(coord_bld, coord, length_f);
468          coord = lp_build_sub(coord_bld, coord, half);
469 
470          /* convert to int, compute lerp weight */
471          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
472          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
473 
474          /* coord0 = max(coord0, 0) */
475          coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
476          /* coord1 = min(coord1, length-1) */
477          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
478       } else {
479          /*
480           * This is pretty reasonable in the end,  all what the tests care
481           * about is nasty edge cases (scaled coords x.5, so the individual
482           * coords are actually integers, which is REALLY tricky to get right
483           * due to this working differently both for negative numbers as well
484           * as for even/odd cases). But with enough magic it's not too complex
485           * after all.
486           * Maybe should try a bit arithmetic one though for POT textures...
487           */
488          LLVMValueRef isNeg;
489          /*
490           * Wrapping just once still works, even though it means we can
491           * get "wrong" sign due to performing mirror in the middle of the
492           * two coords (because this can only happen very near the odd/even
493           * edges, so both coords will actually end up as 0 or length - 1
494           * in the end).
495           * For GL4 gather with per-sample offsets we'd need to the mirroring
496           * per coord too.
497           */
498          coord = lp_build_coord_mirror(bld, coord, FALSE);
499          coord = lp_build_mul(coord_bld, coord, length_f);
500 
501          /*
502           * NaNs should be safe here, we'll do away with them with
503           * the ones' complement plus min.
504           */
505          coord0 = lp_build_sub(coord_bld, coord, half);
506          coord0 = lp_build_ifloor(coord_bld, coord0);
507          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
508          /* ones complement for neg numbers (mirror(negX) = X - 1)  */
509          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
510                               coord0, int_coord_bld->zero);
511          coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
512          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
513                               coord1, int_coord_bld->zero);
514          coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
515          coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
516          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
517 
518          weight = coord_bld->undef;
519       }
520       break;
521 
522    case PIPE_TEX_WRAP_MIRROR_CLAMP:
523       if (bld->static_sampler_state->normalized_coords) {
524          /* scale coord to length */
525          coord = lp_build_mul(coord_bld, coord, length_f);
526       }
527       if (offset) {
528          offset = lp_build_int_to_float(coord_bld, offset);
529          coord = lp_build_add(coord_bld, coord, offset);
530       }
531       /*
532        * XXX: probably not correct for gather, albeit I'm not
533        * entirely sure as it's poorly specified. The wrapping looks
534        * correct according to the spec which is against gl 1.2.1,
535        * however negative values will be swapped - gl re-specified
536        * wrapping with newer versions (no more pre-clamp except with
537        * GL_CLAMP).
538        */
539       coord = lp_build_abs(coord_bld, coord);
540 
541       /* clamp to [0, length] */
542       coord = lp_build_min_ext(coord_bld, coord, length_f,
543                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
544 
545       coord = lp_build_sub(coord_bld, coord, half);
546 
547       /* convert to int, compute lerp weight */
548       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
549       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
550       break;
551 
552    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
553       {
554          struct lp_build_context abs_coord_bld = bld->coord_bld;
555          abs_coord_bld.type.sign = FALSE;
556 
557          if (bld->static_sampler_state->normalized_coords) {
558             /* scale coord to length */
559             coord = lp_build_mul(coord_bld, coord, length_f);
560          }
561          if (offset) {
562             offset = lp_build_int_to_float(coord_bld, offset);
563             coord = lp_build_add(coord_bld, coord, offset);
564          }
565          if (!is_gather) {
566             coord = lp_build_abs(coord_bld, coord);
567 
568             /* clamp to length max */
569             coord = lp_build_min_ext(coord_bld, coord, length_f,
570                                      GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
571             /* subtract 0.5 */
572             coord = lp_build_sub(coord_bld, coord, half);
573             /* clamp to [0, length - 0.5] */
574             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
575 
576             /* convert to int, compute lerp weight */
577             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
578             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
579             /* coord1 = min(coord1, length-1) */
580             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
581          } else {
582             /*
583              * The non-gather path will swap coord0/1 if coord was negative,
584              * which is ok for filtering since the filter weight matches
585              * accordingly. Also, if coord is close to zero, coord0/1 will
586              * be 0 and 1, instead of 0 and 0 (again ok due to filter
587              * weight being 0.0). Both issues need to be fixed for gather.
588              */
589             LLVMValueRef isNeg;
590 
591             /*
592              * Actually wanted to cheat here and use:
593              * coord1 = lp_build_iround(coord_bld, coord);
594              * but it's not good enough for some tests (even piglit
595              * textureGather is set up in a way so the coords area always
596              * .5, that is right at the crossover points).
597              * So do ordinary sub/floor, then do ones' complement
598              * for negative numbers.
599              * (Note can't just do sub|add/abs/itrunc per coord neither -
600              * because the spec demands that mirror(3.0) = 3 but
601              * mirror(-3.0) = 2.)
602              */
603             coord = lp_build_sub(coord_bld, coord, half);
604             coord0 = lp_build_ifloor(coord_bld, coord);
605             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
606             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
607                                  int_coord_bld->zero);
608             coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
609             coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
610 
611             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
612                                  int_coord_bld->zero);
613             coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
614             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
615 
616             weight = coord_bld->undef;
617          }
618       }
619       break;
620 
621    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
622       {
623          if (bld->static_sampler_state->normalized_coords) {
624             /* scale coord to length */
625             coord = lp_build_mul(coord_bld, coord, length_f);
626          }
627          if (offset) {
628             offset = lp_build_int_to_float(coord_bld, offset);
629             coord = lp_build_add(coord_bld, coord, offset);
630          }
631          /*
632           * XXX: probably not correct for gather due to swapped
633           * order if coord is negative (same rationale as for
634           * MIRROR_CLAMP).
635           */
636          coord = lp_build_abs(coord_bld, coord);
637 
638          /*
639           * We don't need any clamp. Technically, for very large
640           * (or infinite) values, clamp against length would be
641           * correct, but we don't need to guarantee any specific
642           * result for such coords (the ifloor will be undefined, but
643           * for modes requiring border all resulting coords are safe).
644           */
645          coord = lp_build_sub(coord_bld, coord, half);
646 
647          /* convert to int, compute lerp weight */
648          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
649          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
650       }
651       break;
652 
653    default:
654       assert(0);
655       coord0 = NULL;
656       coord1 = NULL;
657       weight = NULL;
658    }
659 
660    *x0_out = coord0;
661    *x1_out = coord1;
662    *weight_out = weight;
663 }
664 
665 
666 /**
667  * Build LLVM code for texture wrap mode for nearest filtering.
668  * \param coord  the incoming texcoord (nominally in [0,1])
669  * \param length  the texture size along one dimension, as int vector
670  * \param length_f  the texture size along one dimension, as float vector
671  * \param offset  texel offset along one dimension (as int vector)
672  * \param is_pot  if TRUE, length is a power of two
673  * \param wrap_mode  one of PIPE_TEX_WRAP_x
674  */
675 static LLVMValueRef
lp_build_sample_wrap_nearest(struct lp_build_sample_context * bld,LLVMValueRef coord,LLVMValueRef length,LLVMValueRef length_f,LLVMValueRef offset,boolean is_pot,unsigned wrap_mode)676 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
677                              LLVMValueRef coord,
678                              LLVMValueRef length,
679                              LLVMValueRef length_f,
680                              LLVMValueRef offset,
681                              boolean is_pot,
682                              unsigned wrap_mode)
683 {
684    struct lp_build_context *coord_bld = &bld->coord_bld;
685    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
686    LLVMBuilderRef builder = bld->gallivm->builder;
687    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
688    LLVMValueRef icoord;
689 
690    switch(wrap_mode) {
691    case PIPE_TEX_WRAP_REPEAT:
692       if (is_pot) {
693          coord = lp_build_mul(coord_bld, coord, length_f);
694          icoord = lp_build_ifloor(coord_bld, coord);
695          if (offset) {
696             icoord = lp_build_add(int_coord_bld, icoord, offset);
697          }
698          icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
699       }
700       else {
701           if (offset) {
702              offset = lp_build_int_to_float(coord_bld, offset);
703              offset = lp_build_div(coord_bld, offset, length_f);
704              coord = lp_build_add(coord_bld, coord, offset);
705           }
706           /* take fraction, unnormalize */
707           coord = lp_build_fract_safe(coord_bld, coord);
708           coord = lp_build_mul(coord_bld, coord, length_f);
709           icoord = lp_build_itrunc(coord_bld, coord);
710       }
711       break;
712 
713    case PIPE_TEX_WRAP_CLAMP:
714    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
715       if (bld->static_sampler_state->normalized_coords) {
716          /* scale coord to length */
717          coord = lp_build_mul(coord_bld, coord, length_f);
718       }
719 
720       if (offset) {
721          offset = lp_build_int_to_float(coord_bld, offset);
722          coord = lp_build_add(coord_bld, coord, offset);
723       }
724       /* floor */
725       /* use itrunc instead since we clamp to 0 anyway */
726       icoord = lp_build_itrunc(coord_bld, coord);
727 
728       /* clamp to [0, length - 1]. */
729       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
730                               length_minus_one);
731       break;
732 
733    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
734       if (bld->static_sampler_state->normalized_coords) {
735          /* scale coord to length */
736          coord = lp_build_mul(coord_bld, coord, length_f);
737       }
738       /* no clamp necessary, border masking will handle this */
739       icoord = lp_build_ifloor(coord_bld, coord);
740       if (offset) {
741          icoord = lp_build_add(int_coord_bld, icoord, offset);
742       }
743       break;
744 
745    case PIPE_TEX_WRAP_MIRROR_REPEAT:
746       if (offset) {
747          offset = lp_build_int_to_float(coord_bld, offset);
748          offset = lp_build_div(coord_bld, offset, length_f);
749          coord = lp_build_add(coord_bld, coord, offset);
750       }
751       /* compute mirror function */
752       coord = lp_build_coord_mirror(bld, coord, TRUE);
753 
754       /* scale coord to length */
755       assert(bld->static_sampler_state->normalized_coords);
756       coord = lp_build_mul(coord_bld, coord, length_f);
757 
758       /* itrunc == ifloor here */
759       icoord = lp_build_itrunc(coord_bld, coord);
760 
761       /* clamp to [0, length - 1] */
762       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
763       break;
764 
765    case PIPE_TEX_WRAP_MIRROR_CLAMP:
766    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
767       if (bld->static_sampler_state->normalized_coords) {
768          /* scale coord to length */
769          coord = lp_build_mul(coord_bld, coord, length_f);
770       }
771       if (offset) {
772          offset = lp_build_int_to_float(coord_bld, offset);
773          coord = lp_build_add(coord_bld, coord, offset);
774       }
775       coord = lp_build_abs(coord_bld, coord);
776 
777       /* itrunc == ifloor here */
778       icoord = lp_build_itrunc(coord_bld, coord);
779       /*
780        * Use unsigned min due to possible undef values (NaNs, overflow)
781        */
782       {
783          struct lp_build_context abs_coord_bld = *int_coord_bld;
784          abs_coord_bld.type.sign = FALSE;
785          /* clamp to [0, length - 1] */
786          icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
787       }
788       break;
789 
790    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
791       if (bld->static_sampler_state->normalized_coords) {
792          /* scale coord to length */
793          coord = lp_build_mul(coord_bld, coord, length_f);
794       }
795       if (offset) {
796          offset = lp_build_int_to_float(coord_bld, offset);
797          coord = lp_build_add(coord_bld, coord, offset);
798       }
799       coord = lp_build_abs(coord_bld, coord);
800 
801       /* itrunc == ifloor here */
802       icoord = lp_build_itrunc(coord_bld, coord);
803       break;
804 
805    default:
806       assert(0);
807       icoord = NULL;
808    }
809 
810    return icoord;
811 }
812 
813 
814 /**
815  * Do shadow test/comparison.
816  * \param p shadow ref value
817  * \param texel  the texel to compare against
818  */
819 static LLVMValueRef
lp_build_sample_comparefunc(struct lp_build_sample_context * bld,LLVMValueRef p,LLVMValueRef texel)820 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
821                             LLVMValueRef p,
822                             LLVMValueRef texel)
823 {
824    struct lp_build_context *texel_bld = &bld->texel_bld;
825    LLVMValueRef res;
826 
827    if (0) {
828       //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
829       lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
830    }
831 
832    /* result = (p FUNC texel) ? 1 : 0 */
833    /*
834     * honor d3d10 floating point rules here, which state that comparisons
835     * are ordered except NOT_EQUAL which is unordered.
836     */
837    if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
838       res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
839                                  p, texel);
840    }
841    else {
842       res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
843                          p, texel);
844    }
845    return res;
846 }
847 
848 
849 /**
850  * Generate code to sample a mipmap level with nearest filtering.
851  * If sampling a cube texture, r = cube face in [0,5].
852  */
853 static void
lp_build_sample_image_nearest(struct lp_build_sample_context * bld,LLVMValueRef size,LLVMValueRef row_stride_vec,LLVMValueRef img_stride_vec,LLVMValueRef data_ptr,LLVMValueRef mipoffsets,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef colors_out[4])854 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
855                               LLVMValueRef size,
856                               LLVMValueRef row_stride_vec,
857                               LLVMValueRef img_stride_vec,
858                               LLVMValueRef data_ptr,
859                               LLVMValueRef mipoffsets,
860                               const LLVMValueRef *coords,
861                               const LLVMValueRef *offsets,
862                               LLVMValueRef colors_out[4])
863 {
864    const unsigned dims = bld->dims;
865    LLVMValueRef width_vec;
866    LLVMValueRef height_vec;
867    LLVMValueRef depth_vec;
868    LLVMValueRef flt_size;
869    LLVMValueRef flt_width_vec;
870    LLVMValueRef flt_height_vec;
871    LLVMValueRef flt_depth_vec;
872    LLVMValueRef x, y = NULL, z = NULL;
873 
874    lp_build_extract_image_sizes(bld,
875                                 &bld->int_size_bld,
876                                 bld->int_coord_type,
877                                 size,
878                                 &width_vec, &height_vec, &depth_vec);
879 
880    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
881 
882    lp_build_extract_image_sizes(bld,
883                                 &bld->float_size_bld,
884                                 bld->coord_type,
885                                 flt_size,
886                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
887 
888    /*
889     * Compute integer texcoords.
890     */
891    x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
892                                     flt_width_vec, offsets[0],
893                                     bld->static_texture_state->pot_width,
894                                     bld->static_sampler_state->wrap_s);
895    lp_build_name(x, "tex.x.wrapped");
896 
897    if (dims >= 2) {
898       y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
899                                        flt_height_vec, offsets[1],
900                                        bld->static_texture_state->pot_height,
901                                        bld->static_sampler_state->wrap_t);
902       lp_build_name(y, "tex.y.wrapped");
903 
904       if (dims == 3) {
905          z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
906                                           flt_depth_vec, offsets[2],
907                                           bld->static_texture_state->pot_depth,
908                                           bld->static_sampler_state->wrap_r);
909          lp_build_name(z, "tex.z.wrapped");
910       }
911    }
912    if (has_layer_coord(bld->static_texture_state->target)) {
913       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
914          /* add cube layer to face */
915          z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
916       }
917       else {
918          z = coords[2];
919       }
920       lp_build_name(z, "tex.z.layer");
921    }
922 
923    /*
924     * Get texture colors.
925     */
926    lp_build_sample_texel_soa(bld,
927                              width_vec, height_vec, depth_vec,
928                              x, y, z,
929                              row_stride_vec, img_stride_vec,
930                              data_ptr, mipoffsets, colors_out);
931 
932    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
933       LLVMValueRef cmpval;
934       cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
935       /* this is really just a AND 1.0, cmpval but llvm is clever enough */
936       colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
937                                       bld->texel_bld.one, bld->texel_bld.zero);
938       colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
939    }
940 
941 }
942 
943 
944 /**
945  * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
946  */
947 static LLVMValueRef
lp_build_masklerp(struct lp_build_context * bld,LLVMValueRef weight,LLVMValueRef mask0,LLVMValueRef mask1)948 lp_build_masklerp(struct lp_build_context *bld,
949                  LLVMValueRef weight,
950                  LLVMValueRef mask0,
951                  LLVMValueRef mask1)
952 {
953    struct gallivm_state *gallivm = bld->gallivm;
954    LLVMBuilderRef builder = gallivm->builder;
955    LLVMValueRef weight2;
956 
957    weight2 = lp_build_sub(bld, bld->one, weight);
958    weight = LLVMBuildBitCast(builder, weight,
959                               lp_build_int_vec_type(gallivm, bld->type), "");
960    weight2 = LLVMBuildBitCast(builder, weight2,
961                               lp_build_int_vec_type(gallivm, bld->type), "");
962    weight = LLVMBuildAnd(builder, weight, mask1, "");
963    weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
964    weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
965    weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
966    return lp_build_add(bld, weight, weight2);
967 }
968 
969 /**
970  * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
971  */
972 static LLVMValueRef
lp_build_masklerp2d(struct lp_build_context * bld,LLVMValueRef weight0,LLVMValueRef weight1,LLVMValueRef mask00,LLVMValueRef mask01,LLVMValueRef mask10,LLVMValueRef mask11)973 lp_build_masklerp2d(struct lp_build_context *bld,
974                     LLVMValueRef weight0,
975                     LLVMValueRef weight1,
976                     LLVMValueRef mask00,
977                     LLVMValueRef mask01,
978                     LLVMValueRef mask10,
979                     LLVMValueRef mask11)
980 {
981    LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
982    LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
983    return lp_build_lerp(bld, weight1, val0, val1, 0);
984 }
985 
/*
 * Compile-time switch for accurate cube corner filtering; it is ANDed with
 * the seamless cube filter condition in lp_build_sample_image_linear().
 * This is a fair amount of extra code for something OpenGL merely
 * recommends but does not require.
 */
#define ACCURATE_CUBE_CORNERS 1
991 
992 /**
993  * Generate code to sample a mipmap level with linear filtering.
994  * If sampling a cube texture, r = cube face in [0,5].
995  * If linear_mask is present, only pixels having their mask set
996  * will receive linear filtering, the rest will use nearest.
997  */
998 static void
lp_build_sample_image_linear(struct lp_build_sample_context * bld,boolean is_gather,LLVMValueRef size,LLVMValueRef linear_mask,LLVMValueRef row_stride_vec,LLVMValueRef img_stride_vec,LLVMValueRef data_ptr,LLVMValueRef mipoffsets,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef colors_out[4])999 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1000                              boolean is_gather,
1001                              LLVMValueRef size,
1002                              LLVMValueRef linear_mask,
1003                              LLVMValueRef row_stride_vec,
1004                              LLVMValueRef img_stride_vec,
1005                              LLVMValueRef data_ptr,
1006                              LLVMValueRef mipoffsets,
1007                              const LLVMValueRef *coords,
1008                              const LLVMValueRef *offsets,
1009                              LLVMValueRef colors_out[4])
1010 {
1011    LLVMBuilderRef builder = bld->gallivm->builder;
1012    struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1013    struct lp_build_context *coord_bld = &bld->coord_bld;
1014    struct lp_build_context *texel_bld = &bld->texel_bld;
1015    const unsigned dims = bld->dims;
1016    LLVMValueRef width_vec;
1017    LLVMValueRef height_vec;
1018    LLVMValueRef depth_vec;
1019    LLVMValueRef flt_size;
1020    LLVMValueRef flt_width_vec;
1021    LLVMValueRef flt_height_vec;
1022    LLVMValueRef flt_depth_vec;
1023    LLVMValueRef fall_off[4], have_corners;
1024    LLVMValueRef z1 = NULL;
1025    LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1026    LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1027    LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1028    LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1029    LLVMValueRef xs[4], ys[4], zs[4];
1030    LLVMValueRef neighbors[2][2][4];
1031    int chan, texel_index;
1032    boolean seamless_cube_filter, accurate_cube_corners;
1033    unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1034 
1035    seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1036                            bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1037                           bld->static_sampler_state->seamless_cube_map;
1038 
1039    accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
1040 
1041    lp_build_extract_image_sizes(bld,
1042                                 &bld->int_size_bld,
1043                                 bld->int_coord_type,
1044                                 size,
1045                                 &width_vec, &height_vec, &depth_vec);
1046 
1047    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1048 
1049    lp_build_extract_image_sizes(bld,
1050                                 &bld->float_size_bld,
1051                                 bld->coord_type,
1052                                 flt_size,
1053                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1054 
1055    /*
1056     * Compute integer texcoords.
1057     */
1058 
1059    if (!seamless_cube_filter) {
1060       lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1061                                   flt_width_vec, offsets[0],
1062                                   bld->static_texture_state->pot_width,
1063                                   bld->static_sampler_state->wrap_s,
1064                                   &x00, &x01, &s_fpart);
1065       lp_build_name(x00, "tex.x0.wrapped");
1066       lp_build_name(x01, "tex.x1.wrapped");
1067       x10 = x00;
1068       x11 = x01;
1069 
1070       if (dims >= 2) {
1071          lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1072                                      flt_height_vec, offsets[1],
1073                                      bld->static_texture_state->pot_height,
1074                                      bld->static_sampler_state->wrap_t,
1075                                      &y00, &y10, &t_fpart);
1076          lp_build_name(y00, "tex.y0.wrapped");
1077          lp_build_name(y10, "tex.y1.wrapped");
1078          y01 = y00;
1079          y11 = y10;
1080 
1081          if (dims == 3) {
1082             lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1083                                         flt_depth_vec, offsets[2],
1084                                         bld->static_texture_state->pot_depth,
1085                                         bld->static_sampler_state->wrap_r,
1086                                         &z00, &z1, &r_fpart);
1087             z01 = z10 = z11 = z00;
1088             lp_build_name(z00, "tex.z0.wrapped");
1089             lp_build_name(z1, "tex.z1.wrapped");
1090          }
1091       }
1092       if (has_layer_coord(bld->static_texture_state->target)) {
1093          if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1094             /* add cube layer to face */
1095             z00 = z01 = z10 = z11 = z1 =
1096                lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1097          }
1098          else {
1099             z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
1100          }
1101          lp_build_name(z00, "tex.z0.layer");
1102          lp_build_name(z1, "tex.z1.layer");
1103       }
1104    }
1105    else {
1106       struct lp_build_if_state edge_if;
1107       LLVMTypeRef int1t;
1108       LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1109       LLVMValueRef coord0, coord1, have_edge, have_corner;
1110       LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1111       LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1112       LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1113       LLVMValueRef face = coords[2];
1114       LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1115       LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1116       /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1117       height_vec = width_vec;
1118       flt_height_vec = flt_width_vec;
1119 
1120       /* XXX the overflow logic is actually sort of duplicated with trilinear,
1121        * since an overflow in one mip should also have a corresponding overflow
1122        * in another.
1123        */
1124       /* should always have normalized coords, and offsets are undefined */
1125       assert(bld->static_sampler_state->normalized_coords);
1126       /*
1127        * The coords should all be between [0,1] however we can have NaNs,
1128        * which will wreak havoc. In particular the y1_clamped value below
1129        * can be -INT_MAX (on x86) and be propagated right through (probably
1130        * other values might be bogus in the end too).
1131        * So kill off the NaNs here.
1132        */
1133       coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1134                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1135       coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1136       /* instead of clamp, build mask if overflowed */
1137       coord0 = lp_build_sub(coord_bld, coord0, half);
1138       /* convert to int, compute lerp weight */
1139       /* not ideal with AVX (and no AVX2) */
1140       lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1141       x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1142       coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1143                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1144       coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1145       coord1 = lp_build_sub(coord_bld, coord1, half);
1146       lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1147       y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1148 
1149       fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1150       fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1151       fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1152       fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1153 
1154       fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1155       fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1156       have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1157       have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1158 
1159       /* needed for accurate corner filtering branch later, rely on 0 init */
1160       int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1161       have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1162 
1163       for (texel_index = 0; texel_index < 4; texel_index++) {
1164          xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1165          ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1166          zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1167       }
1168 
1169       lp_build_if(&edge_if, bld->gallivm, have_edge);
1170 
1171       have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1172       have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1173       LLVMBuildStore(builder, have_corner, have_corners);
1174 
1175       /*
1176        * Need to feed clamped values here for cheap corner handling,
1177        * but only for y coord (as when falling off both edges we only
1178        * fall off the x one) - this should be sufficient.
1179        */
1180       y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1181       y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1182 
1183       /*
1184        * Get all possible new coords.
1185        */
1186       lp_build_cube_new_coords(ivec_bld, face,
1187                                x0, x1, y0_clamped, y1_clamped,
1188                                length_minus_one,
1189                                new_faces, new_xcoords, new_ycoords);
1190 
1191       /* handle fall off x-, x+ direction */
1192       /* determine new coords, face (not both fall_off vars can be true at same time) */
1193       x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1194       y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1195       x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1196       y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1197       x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1198       y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1199       x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1200       y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1201 
1202       z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1203       z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1204 
1205       /* handle fall off y-, y+ direction */
1206       /*
1207        * Cheap corner logic: just hack up things so a texel doesn't fall
1208        * off both sides (which means filter weights will be wrong but we'll only
1209        * use valid texels in the filter).
1210        * This means however (y) coords must additionally be clamped (see above).
1211        * This corner handling should be fully OpenGL (but not d3d10) compliant.
1212        */
1213       fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1214       fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1215       fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1216       fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1217 
1218       x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1219       y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1220       x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1221       y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1222       x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1223       y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1224       x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1225       y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1226 
1227       z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1228       z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1229       z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1230       z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1231 
1232       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1233          /* now can add cube layer to face (per sample) */
1234          z00 = lp_build_add(ivec_bld, z00, coords[3]);
1235          z01 = lp_build_add(ivec_bld, z01, coords[3]);
1236          z10 = lp_build_add(ivec_bld, z10, coords[3]);
1237          z11 = lp_build_add(ivec_bld, z11, coords[3]);
1238       }
1239 
1240       LLVMBuildStore(builder, x00, xs[0]);
1241       LLVMBuildStore(builder, x01, xs[1]);
1242       LLVMBuildStore(builder, x10, xs[2]);
1243       LLVMBuildStore(builder, x11, xs[3]);
1244       LLVMBuildStore(builder, y00, ys[0]);
1245       LLVMBuildStore(builder, y01, ys[1]);
1246       LLVMBuildStore(builder, y10, ys[2]);
1247       LLVMBuildStore(builder, y11, ys[3]);
1248       LLVMBuildStore(builder, z00, zs[0]);
1249       LLVMBuildStore(builder, z01, zs[1]);
1250       LLVMBuildStore(builder, z10, zs[2]);
1251       LLVMBuildStore(builder, z11, zs[3]);
1252 
1253       lp_build_else(&edge_if);
1254 
1255       LLVMBuildStore(builder, x0, xs[0]);
1256       LLVMBuildStore(builder, x1, xs[1]);
1257       LLVMBuildStore(builder, x0, xs[2]);
1258       LLVMBuildStore(builder, x1, xs[3]);
1259       LLVMBuildStore(builder, y0, ys[0]);
1260       LLVMBuildStore(builder, y0, ys[1]);
1261       LLVMBuildStore(builder, y1, ys[2]);
1262       LLVMBuildStore(builder, y1, ys[3]);
1263       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1264          LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1265          LLVMBuildStore(builder, cube_layer, zs[0]);
1266          LLVMBuildStore(builder, cube_layer, zs[1]);
1267          LLVMBuildStore(builder, cube_layer, zs[2]);
1268          LLVMBuildStore(builder, cube_layer, zs[3]);
1269       }
1270       else {
1271          LLVMBuildStore(builder, face, zs[0]);
1272          LLVMBuildStore(builder, face, zs[1]);
1273          LLVMBuildStore(builder, face, zs[2]);
1274          LLVMBuildStore(builder, face, zs[3]);
1275       }
1276 
1277       lp_build_endif(&edge_if);
1278 
1279       x00 = LLVMBuildLoad(builder, xs[0], "");
1280       x01 = LLVMBuildLoad(builder, xs[1], "");
1281       x10 = LLVMBuildLoad(builder, xs[2], "");
1282       x11 = LLVMBuildLoad(builder, xs[3], "");
1283       y00 = LLVMBuildLoad(builder, ys[0], "");
1284       y01 = LLVMBuildLoad(builder, ys[1], "");
1285       y10 = LLVMBuildLoad(builder, ys[2], "");
1286       y11 = LLVMBuildLoad(builder, ys[3], "");
1287       z00 = LLVMBuildLoad(builder, zs[0], "");
1288       z01 = LLVMBuildLoad(builder, zs[1], "");
1289       z10 = LLVMBuildLoad(builder, zs[2], "");
1290       z11 = LLVMBuildLoad(builder, zs[3], "");
1291    }
1292 
1293    if (linear_mask) {
1294       /*
1295        * Whack filter weights into place. Whatever texel had more weight is
1296        * the one which should have been selected by nearest filtering hence
1297        * just use 100% weight for it.
1298        */
1299       struct lp_build_context *c_bld = &bld->coord_bld;
1300       LLVMValueRef w1_mask, w1_weight;
1301       LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1302 
1303       w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1304       /* this select is really just a "and" */
1305       w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1306       s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1307       if (dims >= 2) {
1308          w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1309          w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1310          t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1311          if (dims == 3) {
1312             w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1313             w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1314             r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1315          }
1316       }
1317    }
1318 
1319    /*
1320     * Get texture colors.
1321     */
1322    /* get x0/x1 texels */
1323    lp_build_sample_texel_soa(bld,
1324                              width_vec, height_vec, depth_vec,
1325                              x00, y00, z00,
1326                              row_stride_vec, img_stride_vec,
1327                              data_ptr, mipoffsets, neighbors[0][0]);
1328    lp_build_sample_texel_soa(bld,
1329                              width_vec, height_vec, depth_vec,
1330                              x01, y01, z01,
1331                              row_stride_vec, img_stride_vec,
1332                              data_ptr, mipoffsets, neighbors[0][1]);
1333 
1334    if (dims == 1) {
1335       assert(!is_gather);
1336       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1337          /* Interpolate two samples from 1D image to produce one color */
1338          for (chan = 0; chan < 4; chan++) {
1339             colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1340                                              neighbors[0][0][chan],
1341                                              neighbors[0][1][chan],
1342                                              0);
1343          }
1344       }
1345       else {
1346          LLVMValueRef cmpval0, cmpval1;
1347          cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1348          cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1349          /* simplified lerp, AND mask with weight and add */
1350          colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1351                                            cmpval0, cmpval1);
1352          colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1353       }
1354    }
1355    else {
1356       /* 2D/3D texture */
1357       struct lp_build_if_state corner_if;
1358       LLVMValueRef colors0[4], colorss[4];
1359 
1360       /* get x0/x1 texels at y1 */
1361       lp_build_sample_texel_soa(bld,
1362                                 width_vec, height_vec, depth_vec,
1363                                 x10, y10, z10,
1364                                 row_stride_vec, img_stride_vec,
1365                                 data_ptr, mipoffsets, neighbors[1][0]);
1366       lp_build_sample_texel_soa(bld,
1367                                 width_vec, height_vec, depth_vec,
1368                                 x11, y11, z11,
1369                                 row_stride_vec, img_stride_vec,
1370                                 data_ptr, mipoffsets, neighbors[1][1]);
1371 
1372       /*
1373        * To avoid having to duplicate linear_mask / fetch code use
1374        * another branch (with corner condition though edge would work
1375        * as well) here.
1376        */
1377       if (accurate_cube_corners) {
1378          LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1379          LLVMValueRef have_corner, one_third;
1380 
1381          colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1382          colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1383          colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1384          colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1385 
1386          have_corner = LLVMBuildLoad(builder, have_corners, "");
1387 
1388          lp_build_if(&corner_if, bld->gallivm, have_corner);
1389 
1390          one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1391                                         1.0f/3.0f);
1392 
1393          /* find corner */
1394          c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1395          c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1396          c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1397          c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1398          c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1399          c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1400          c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1401          c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1402 
1403          if (!is_gather) {
1404             /*
1405              * we can't use standard 2d lerp as we need per-element weight
1406              * in case of corners, so just calculate bilinear result as
1407              * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1408              * (This is actually less work than using 2d lerp, 7 vs. 9
1409              * instructions, however calculating the weights needs another 6,
1410              * so actually probably not slower than 2d lerp only for 4 channels
1411              * as weights only need to be calculated once - of course fixing
1412              * the weights has additional cost.)
1413              */
1414             LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1415             wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1416             wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1417             w00 = lp_build_mul(coord_bld, wx0, wy0);
1418             w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1419             w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1420             w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1421 
1422             /* find corner weight */
1423             c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1424             c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1425             c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1426             c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1427 
1428             /*
1429              * add 1/3 of the corner weight to the weight of the 3 other
1430              * samples and null out corner weight.
1431              */
1432             c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1433             w00 = lp_build_add(coord_bld, w00, c_weight);
1434             w00 = lp_build_andnot(coord_bld, w00, c00f);
1435             w01 = lp_build_add(coord_bld, w01, c_weight);
1436             w01 = lp_build_andnot(coord_bld, w01, c01f);
1437             w10 = lp_build_add(coord_bld, w10, c_weight);
1438             w10 = lp_build_andnot(coord_bld, w10, c10f);
1439             w11 = lp_build_add(coord_bld, w11, c_weight);
1440             w11 = lp_build_andnot(coord_bld, w11, c11f);
1441 
1442             if (bld->static_sampler_state->compare_mode ==
1443                 PIPE_TEX_COMPARE_NONE) {
1444                for (chan = 0; chan < 4; chan++) {
1445                   colors0[chan] = lp_build_mul(coord_bld, w00,
1446                                                neighbors[0][0][chan]);
1447                   tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1448                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1449                   tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1450                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1451                   tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1452                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1453                }
1454             }
1455             else {
1456                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1457                cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1458                                                       neighbors[0][0][0]);
1459                cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1460                                                       neighbors[0][1][0]);
1461                cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1462                                                       neighbors[1][0][0]);
1463                cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1464                                                       neighbors[1][1][0]);
1465                /*
1466                 * inputs to interpolation are just masks so just add
1467                 * masked weights together
1468                 */
1469                cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1470                                            coord_bld->vec_type, "");
1471                cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1472                                            coord_bld->vec_type, "");
1473                cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1474                                            coord_bld->vec_type, "");
1475                cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1476                                            coord_bld->vec_type, "");
1477                colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1478                tmp = lp_build_and(coord_bld, w01, cmpval01);
1479                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1480                tmp = lp_build_and(coord_bld, w10, cmpval10);
1481                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1482                tmp = lp_build_and(coord_bld, w11, cmpval11);
1483                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1484                colors0[1] = colors0[2] = colors0[3] = colors0[0];
1485             }
1486          }
1487          else {
1488             /*
1489              * We don't have any weights to adjust, so instead calculate
1490              * the fourth texel as simply the average of the other 3.
1491              * (This would work for non-gather too, however we'd have
1492              * a boatload more of the select stuff due to there being
1493              * 4 times as many colors as weights.)
1494              */
1495             LLVMValueRef col00, col01, col10, col11;
1496             LLVMValueRef colc, colc0, colc1;
1497             col10 = lp_build_swizzle_soa_channel(texel_bld,
1498                                                  neighbors[1][0], chan_swiz);
1499             col11 = lp_build_swizzle_soa_channel(texel_bld,
1500                                                  neighbors[1][1], chan_swiz);
1501             col01 = lp_build_swizzle_soa_channel(texel_bld,
1502                                                  neighbors[0][1], chan_swiz);
1503             col00 = lp_build_swizzle_soa_channel(texel_bld,
1504                                                  neighbors[0][0], chan_swiz);
1505 
1506             /*
1507              * The spec says for comparison filtering, the comparison
1508              * must happen before synthesizing the new value.
1509              * This means all gathered values are always 0 or 1,
1510              * except for the non-existing texel, which can be 0,1/3,2/3,1...
1511              * Seems like we'd be allowed to just return 0 or 1 too, so we
1512              * could simplify and pass down the compare mask values to the
1513              * end (using int arithmetic/compare on the mask values to
1514              * construct the fourth texel) and only there convert to floats
1515              * but it's probably not worth it (it might be easier for the cpu
1516              * but not for the code)...
1517              */
1518             if (bld->static_sampler_state->compare_mode !=
1519                 PIPE_TEX_COMPARE_NONE) {
1520                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1521                cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1522                cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1523                cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1524                cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1525                col00 = lp_build_select(texel_bld, cmpval00,
1526                                        texel_bld->one, texel_bld->zero);
1527                col01 = lp_build_select(texel_bld, cmpval01,
1528                                        texel_bld->one, texel_bld->zero);
1529                col10 = lp_build_select(texel_bld, cmpval10,
1530                                        texel_bld->one, texel_bld->zero);
1531                col11 = lp_build_select(texel_bld, cmpval11,
1532                                        texel_bld->one, texel_bld->zero);
1533             }
1534 
1535             /*
1536              * Null out corner color.
1537              */
1538             col00 = lp_build_andnot(coord_bld, col00, c00f);
1539             col01 = lp_build_andnot(coord_bld, col01, c01f);
1540             col10 = lp_build_andnot(coord_bld, col10, c10f);
1541             col11 = lp_build_andnot(coord_bld, col11, c11f);
1542 
1543             /*
1544              * New corner texel color is all colors added / 3.
1545              */
1546             colc0 = lp_build_add(coord_bld, col00, col01);
1547             colc1 = lp_build_add(coord_bld, col10, col11);
1548             colc = lp_build_add(coord_bld, colc0, colc1);
1549             colc = lp_build_mul(coord_bld, one_third, colc);
1550 
1551             /*
1552              * Replace the corner texel color with the new value.
1553              */
1554             col00 = lp_build_select(coord_bld, c00, colc, col00);
1555             col01 = lp_build_select(coord_bld, c01, colc, col01);
1556             col10 = lp_build_select(coord_bld, c10, colc, col10);
1557             col11 = lp_build_select(coord_bld, c11, colc, col11);
1558 
1559             colors0[0] = col10;
1560             colors0[1] = col11;
1561             colors0[2] = col01;
1562             colors0[3] = col00;
1563          }
1564 
1565          LLVMBuildStore(builder, colors0[0], colorss[0]);
1566          LLVMBuildStore(builder, colors0[1], colorss[1]);
1567          LLVMBuildStore(builder, colors0[2], colorss[2]);
1568          LLVMBuildStore(builder, colors0[3], colorss[3]);
1569 
1570          lp_build_else(&corner_if);
1571       }
1572 
1573       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1574          if (is_gather) {
1575             /*
1576              * Just assign the red channel (no component selection yet).
1577              * This is a bit hackish, we usually do the swizzle at the
1578              * end of sampling (much less values to swizzle), but this
1579              * obviously cannot work when using gather.
1580              */
1581             colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1582                                                       neighbors[1][0],
1583                                                       chan_swiz);
1584             colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1585                                                       neighbors[1][1],
1586                                                       chan_swiz);
1587             colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1588                                                       neighbors[0][1],
1589                                                       chan_swiz);
1590             colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1591                                                       neighbors[0][0],
1592                                                       chan_swiz);
1593          }
1594          else {
1595             /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1596             for (chan = 0; chan < 4; chan++) {
1597                colors0[chan] = lp_build_lerp_2d(texel_bld,
1598                                                 s_fpart, t_fpart,
1599                                                 neighbors[0][0][chan],
1600                                                 neighbors[0][1][chan],
1601                                                 neighbors[1][0][chan],
1602                                                 neighbors[1][1][chan],
1603                                                 0);
1604             }
1605          }
1606       }
1607       else {
1608          LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1609          cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1610          cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1611          cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1612          cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1613 
1614          if (is_gather) {
1615             /* more hacks for swizzling, should be X, ONE or ZERO... */
1616             colors0[0] = lp_build_select(texel_bld, cmpval10,
1617                                          texel_bld->one, texel_bld->zero);
1618             colors0[1] = lp_build_select(texel_bld, cmpval11,
1619                                          texel_bld->one, texel_bld->zero);
1620             colors0[2] = lp_build_select(texel_bld, cmpval01,
1621                                          texel_bld->one, texel_bld->zero);
1622             colors0[3] = lp_build_select(texel_bld, cmpval00,
1623                                          texel_bld->one, texel_bld->zero);
1624          }
1625          else {
1626             colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1627                                              cmpval00, cmpval01, cmpval10, cmpval11);
1628             colors0[1] = colors0[2] = colors0[3] = colors0[0];
1629          }
1630       }
1631 
1632       if (accurate_cube_corners) {
1633          LLVMBuildStore(builder, colors0[0], colorss[0]);
1634          LLVMBuildStore(builder, colors0[1], colorss[1]);
1635          LLVMBuildStore(builder, colors0[2], colorss[2]);
1636          LLVMBuildStore(builder, colors0[3], colorss[3]);
1637 
1638          lp_build_endif(&corner_if);
1639 
1640          colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1641          colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1642          colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1643          colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1644       }
1645 
1646       if (dims == 3) {
1647          LLVMValueRef neighbors1[2][2][4];
1648          LLVMValueRef colors1[4];
1649 
1650          assert(!is_gather);
1651 
1652          /* get x0/x1/y0/y1 texels at z1 */
1653          lp_build_sample_texel_soa(bld,
1654                                    width_vec, height_vec, depth_vec,
1655                                    x00, y00, z1,
1656                                    row_stride_vec, img_stride_vec,
1657                                    data_ptr, mipoffsets, neighbors1[0][0]);
1658          lp_build_sample_texel_soa(bld,
1659                                    width_vec, height_vec, depth_vec,
1660                                    x01, y01, z1,
1661                                    row_stride_vec, img_stride_vec,
1662                                    data_ptr, mipoffsets, neighbors1[0][1]);
1663          lp_build_sample_texel_soa(bld,
1664                                    width_vec, height_vec, depth_vec,
1665                                    x10, y10, z1,
1666                                    row_stride_vec, img_stride_vec,
1667                                    data_ptr, mipoffsets, neighbors1[1][0]);
1668          lp_build_sample_texel_soa(bld,
1669                                    width_vec, height_vec, depth_vec,
1670                                    x11, y11, z1,
1671                                    row_stride_vec, img_stride_vec,
1672                                    data_ptr, mipoffsets, neighbors1[1][1]);
1673 
1674          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1675             /* Bilinear interpolate the four samples from the second Z slice */
1676             for (chan = 0; chan < 4; chan++) {
1677                colors1[chan] = lp_build_lerp_2d(texel_bld,
1678                                                 s_fpart, t_fpart,
1679                                                 neighbors1[0][0][chan],
1680                                                 neighbors1[0][1][chan],
1681                                                 neighbors1[1][0][chan],
1682                                                 neighbors1[1][1][chan],
1683                                                 0);
1684             }
1685             /* Linearly interpolate the two samples from the two 3D slices */
1686             for (chan = 0; chan < 4; chan++) {
1687                colors_out[chan] = lp_build_lerp(texel_bld,
1688                                                 r_fpart,
1689                                                 colors0[chan], colors1[chan],
1690                                                 0);
1691             }
1692          }
1693          else {
1694             LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1695             cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1696             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1697             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1698             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1699             colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1700                                              cmpval00, cmpval01, cmpval10, cmpval11);
1701             /* Linearly interpolate the two samples from the two 3D slices */
1702             colors_out[0] = lp_build_lerp(texel_bld,
1703                                           r_fpart,
1704                                           colors0[0], colors1[0],
1705                                           0);
1706             colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1707          }
1708       }
1709       else {
1710          /* 2D tex */
1711          for (chan = 0; chan < 4; chan++) {
1712             colors_out[chan] = colors0[chan];
1713          }
1714       }
1715    }
1716    if (is_gather) {
1717       /*
1718        * For gather, we can't do our usual channel swizzling done later,
1719        * so do it here. It only really matters for 0/1 swizzles in case
1720        * of comparison filtering, since in this case the results would be
1721        * wrong, without comparison it should all work out alright but it
1722        * can't hurt to do that here, since it will instantly drop all
1723        * calculations above, though it's a rather stupid idea to do
1724        * gather on a channel which will always return 0 or 1 in any case...
1725        */
1726       if (chan_swiz == PIPE_SWIZZLE_1) {
1727          for (chan = 0; chan < 4; chan++) {
1728             colors_out[chan] = texel_bld->one;
1729          }
1730       } else if (chan_swiz == PIPE_SWIZZLE_0) {
1731          for (chan = 0; chan < 4; chan++) {
1732             colors_out[chan] = texel_bld->zero;
1733          }
1734       }
1735    }
1736 }
1737 
1738 
1739 /**
1740  * Sample the texture/mipmap using given image filter and mip filter.
1741  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1742  * from (vectors or scalars).
1743  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1744  */
1745 static void
lp_build_sample_mipmap(struct lp_build_sample_context * bld,unsigned img_filter,unsigned mip_filter,boolean is_gather,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef ilevel0,LLVMValueRef ilevel1,LLVMValueRef lod_fpart,LLVMValueRef * colors_out)1746 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1747                        unsigned img_filter,
1748                        unsigned mip_filter,
1749                        boolean is_gather,
1750                        const LLVMValueRef *coords,
1751                        const LLVMValueRef *offsets,
1752                        LLVMValueRef ilevel0,
1753                        LLVMValueRef ilevel1,
1754                        LLVMValueRef lod_fpart,
1755                        LLVMValueRef *colors_out)
1756 {
1757    LLVMBuilderRef builder = bld->gallivm->builder;
1758    LLVMValueRef size0 = NULL;
1759    LLVMValueRef size1 = NULL;
1760    LLVMValueRef row_stride0_vec = NULL;
1761    LLVMValueRef row_stride1_vec = NULL;
1762    LLVMValueRef img_stride0_vec = NULL;
1763    LLVMValueRef img_stride1_vec = NULL;
1764    LLVMValueRef data_ptr0 = NULL;
1765    LLVMValueRef data_ptr1 = NULL;
1766    LLVMValueRef mipoff0 = NULL;
1767    LLVMValueRef mipoff1 = NULL;
1768    LLVMValueRef colors0[4], colors1[4];
1769    unsigned chan;
1770 
1771    /* sample the first mipmap level */
1772    lp_build_mipmap_level_sizes(bld, ilevel0,
1773                                &size0,
1774                                &row_stride0_vec, &img_stride0_vec);
1775    if (bld->num_mips == 1) {
1776       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1777    }
1778    else {
1779       /* This path should work for num_lods 1 too but slightly less efficient */
1780       data_ptr0 = bld->base_ptr;
1781       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1782    }
1783    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1784       lp_build_sample_image_nearest(bld, size0,
1785                                     row_stride0_vec, img_stride0_vec,
1786                                     data_ptr0, mipoff0, coords, offsets,
1787                                     colors0);
1788    }
1789    else {
1790       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1791       lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1792                                    row_stride0_vec, img_stride0_vec,
1793                                    data_ptr0, mipoff0, coords, offsets,
1794                                    colors0);
1795    }
1796 
1797    /* Store the first level's colors in the output variables */
1798    for (chan = 0; chan < 4; chan++) {
1799        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1800    }
1801 
1802    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1803       struct lp_build_if_state if_ctx;
1804       LLVMValueRef need_lerp;
1805 
1806       /* need_lerp = lod_fpart > 0 */
1807       if (bld->num_lods == 1) {
1808          need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1809                                    lod_fpart, bld->lodf_bld.zero,
1810                                    "need_lerp");
1811       }
1812       else {
1813          /*
1814           * We'll do mip filtering if any of the quads (or individual
1815           * pixel in case of per-pixel lod) need it.
1816           * It might be better to split the vectors here and only fetch/filter
1817           * quads which need it (if there's one lod per quad).
1818           */
1819          need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1820                                       PIPE_FUNC_GREATER,
1821                                       lod_fpart, bld->lodf_bld.zero);
1822          need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1823          lp_build_name(need_lerp, "need_lerp");
1824       }
1825 
1826       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1827       {
1828          /*
1829           * We unfortunately need to clamp lod_fpart here since we can get
1830           * negative values which would screw up filtering if not all
1831           * lod_fpart values have same sign.
1832           */
1833          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1834                                   bld->lodf_bld.zero);
1835          /* sample the second mipmap level */
1836          lp_build_mipmap_level_sizes(bld, ilevel1,
1837                                      &size1,
1838                                      &row_stride1_vec, &img_stride1_vec);
1839          if (bld->num_mips == 1) {
1840             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1841          }
1842          else {
1843             data_ptr1 = bld->base_ptr;
1844             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1845          }
1846          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1847             lp_build_sample_image_nearest(bld, size1,
1848                                           row_stride1_vec, img_stride1_vec,
1849                                           data_ptr1, mipoff1, coords, offsets,
1850                                           colors1);
1851          }
1852          else {
1853             lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1854                                          row_stride1_vec, img_stride1_vec,
1855                                          data_ptr1, mipoff1, coords, offsets,
1856                                          colors1);
1857          }
1858 
1859          /* interpolate samples from the two mipmap levels */
1860 
1861          if (bld->num_lods != bld->coord_type.length)
1862             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1863                                                               bld->lodf_bld.type,
1864                                                               bld->texel_bld.type,
1865                                                               lod_fpart);
1866 
1867          for (chan = 0; chan < 4; chan++) {
1868             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1869                                           colors0[chan], colors1[chan],
1870                                           0);
1871             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1872          }
1873       }
1874       lp_build_endif(&if_ctx);
1875    }
1876 }
1877 
1878 
1879 /**
1880  * Sample the texture/mipmap using given mip filter, and using
1881  * both nearest and linear filtering at the same time depending
1882  * on linear_mask.
1883  * lod can be per quad but linear_mask is always per pixel.
1884  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1885  * from (vectors or scalars).
1886  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1887  */
1888 static void
lp_build_sample_mipmap_both(struct lp_build_sample_context * bld,LLVMValueRef linear_mask,unsigned mip_filter,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef ilevel0,LLVMValueRef ilevel1,LLVMValueRef lod_fpart,LLVMValueRef lod_positive,LLVMValueRef * colors_out)1889 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1890                             LLVMValueRef linear_mask,
1891                             unsigned mip_filter,
1892                             const LLVMValueRef *coords,
1893                             const LLVMValueRef *offsets,
1894                             LLVMValueRef ilevel0,
1895                             LLVMValueRef ilevel1,
1896                             LLVMValueRef lod_fpart,
1897                             LLVMValueRef lod_positive,
1898                             LLVMValueRef *colors_out)
1899 {
1900    LLVMBuilderRef builder = bld->gallivm->builder;
1901    LLVMValueRef size0 = NULL;
1902    LLVMValueRef size1 = NULL;
1903    LLVMValueRef row_stride0_vec = NULL;
1904    LLVMValueRef row_stride1_vec = NULL;
1905    LLVMValueRef img_stride0_vec = NULL;
1906    LLVMValueRef img_stride1_vec = NULL;
1907    LLVMValueRef data_ptr0 = NULL;
1908    LLVMValueRef data_ptr1 = NULL;
1909    LLVMValueRef mipoff0 = NULL;
1910    LLVMValueRef mipoff1 = NULL;
1911    LLVMValueRef colors0[4], colors1[4];
1912    unsigned chan;
1913 
1914    /* sample the first mipmap level */
1915    lp_build_mipmap_level_sizes(bld, ilevel0,
1916                                &size0,
1917                                &row_stride0_vec, &img_stride0_vec);
1918    if (bld->num_mips == 1) {
1919       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1920    }
1921    else {
1922       /* This path should work for num_lods 1 too but slightly less efficient */
1923       data_ptr0 = bld->base_ptr;
1924       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1925    }
1926 
1927    lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1928                                 row_stride0_vec, img_stride0_vec,
1929                                 data_ptr0, mipoff0, coords, offsets,
1930                                 colors0);
1931 
1932    /* Store the first level's colors in the output variables */
1933    for (chan = 0; chan < 4; chan++) {
1934        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1935    }
1936 
1937    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1938       struct lp_build_if_state if_ctx;
1939       LLVMValueRef need_lerp;
1940 
1941       /*
1942        * We'll do mip filtering if any of the quads (or individual
1943        * pixel in case of per-pixel lod) need it.
1944        * Note using lod_positive here not lod_fpart since it may be the same
1945        * condition as that used in the outer "if" in the caller hence llvm
1946        * should be able to merge the branches in this case.
1947        */
1948       need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1949       lp_build_name(need_lerp, "need_lerp");
1950 
1951       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1952       {
1953          /*
1954           * We unfortunately need to clamp lod_fpart here since we can get
1955           * negative values which would screw up filtering if not all
1956           * lod_fpart values have same sign.
1957           */
1958          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1959                                   bld->lodf_bld.zero);
1960          /* sample the second mipmap level */
1961          lp_build_mipmap_level_sizes(bld, ilevel1,
1962                                      &size1,
1963                                      &row_stride1_vec, &img_stride1_vec);
1964          if (bld->num_mips == 1) {
1965             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1966          }
1967          else {
1968             data_ptr1 = bld->base_ptr;
1969             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1970          }
1971 
1972          lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
1973                                       row_stride1_vec, img_stride1_vec,
1974                                       data_ptr1, mipoff1, coords, offsets,
1975                                       colors1);
1976 
1977          /* interpolate samples from the two mipmap levels */
1978 
1979          if (bld->num_lods != bld->coord_type.length)
1980             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1981                                                               bld->lodf_bld.type,
1982                                                               bld->texel_bld.type,
1983                                                               lod_fpart);
1984 
1985          for (chan = 0; chan < 4; chan++) {
1986             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1987                                           colors0[chan], colors1[chan],
1988                                           0);
1989             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1990          }
1991       }
1992       lp_build_endif(&if_ctx);
1993    }
1994 }
1995 
1996 
1997 /**
1998  * Build (per-coord) layer value.
1999  * Either clamp layer to valid values or fill in optional out_of_bounds
2000  * value and just return value unclamped.
2001  */
2002 static LLVMValueRef
lp_build_layer_coord(struct lp_build_sample_context * bld,unsigned texture_unit,boolean is_cube_array,LLVMValueRef layer,LLVMValueRef * out_of_bounds)2003 lp_build_layer_coord(struct lp_build_sample_context *bld,
2004                      unsigned texture_unit,
2005                      boolean is_cube_array,
2006                      LLVMValueRef layer,
2007                      LLVMValueRef *out_of_bounds)
2008 {
2009    LLVMValueRef num_layers;
2010    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2011 
2012    num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2013                                           bld->context_ptr, texture_unit);
2014 
2015    if (out_of_bounds) {
2016       LLVMValueRef out1, out;
2017       assert(!is_cube_array);
2018       num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2019       out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2020       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2021       *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2022       return layer;
2023    }
2024    else {
2025       LLVMValueRef maxlayer;
2026       LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2027                                        bld->int_bld.one;
2028       maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2029       maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2030       return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2031    }
2032 }
2033 
2034 
2035 /**
2036  * Calculate cube face, lod, mip levels.
2037  */
2038 static void
lp_build_sample_common(struct lp_build_sample_context * bld,boolean is_lodq,unsigned texture_index,unsigned sampler_index,LLVMValueRef * coords,const struct lp_derivatives * derivs,LLVMValueRef lod_bias,LLVMValueRef explicit_lod,LLVMValueRef * lod_pos_or_zero,LLVMValueRef * lod,LLVMValueRef * lod_fpart,LLVMValueRef * ilevel0,LLVMValueRef * ilevel1)2039 lp_build_sample_common(struct lp_build_sample_context *bld,
2040                        boolean is_lodq,
2041                        unsigned texture_index,
2042                        unsigned sampler_index,
2043                        LLVMValueRef *coords,
2044                        const struct lp_derivatives *derivs, /* optional */
2045                        LLVMValueRef lod_bias, /* optional */
2046                        LLVMValueRef explicit_lod, /* optional */
2047                        LLVMValueRef *lod_pos_or_zero,
2048                        LLVMValueRef *lod,
2049                        LLVMValueRef *lod_fpart,
2050                        LLVMValueRef *ilevel0,
2051                        LLVMValueRef *ilevel1)
2052 {
2053    const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2054    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2055    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2056    const unsigned target = bld->static_texture_state->target;
2057    LLVMValueRef first_level, cube_rho = NULL;
2058    LLVMValueRef lod_ipart = NULL;
2059    struct lp_derivatives cube_derivs;
2060 
2061    /*
2062    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
2063           mip_filter, min_filter, mag_filter);
2064    */
2065 
2066    /*
2067     * Choose cube face, recompute texcoords for the chosen face and
2068     * compute rho here too (as it requires transform of derivatives).
2069     */
2070    if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2071       boolean need_derivs;
2072       need_derivs = ((min_filter != mag_filter ||
2073                       mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2074                       !bld->static_sampler_state->min_max_lod_equal &&
2075                       !explicit_lod);
2076       lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2077       derivs = &cube_derivs;
2078       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
2079          /* calculate cube layer coord now */
2080          LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2081          LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2082          layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2083          coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2084          /* because of seamless filtering can't add it to face (coords[2]) here. */
2085       }
2086    }
2087    else if (target == PIPE_TEXTURE_1D_ARRAY ||
2088             target == PIPE_TEXTURE_2D_ARRAY) {
2089       coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2090       coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2091    }
2092 
2093    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2094       /*
2095        * Clamp p coords to [0,1] for fixed function depth texture format here.
2096        * Technically this is not entirely correct for unorm depth as the ref value
2097        * should be converted to the depth format (quantization!) and comparison
2098        * then done in texture format. This would actually help performance (since
2099        * only need to do it once and could save the per-sample conversion of texels
2100        * to floats instead), but it would need more messy code (would need to push
2101        * at least some bits down to actual fetch so conversion could be skipped,
2102        * and would have ugly interaction with border color, would need to convert
2103        * border color to that format too or do some other tricks to make it work).
2104        */
2105       const struct util_format_description *format_desc = bld->format_desc;
2106       unsigned chan_type;
2107       /* not entirely sure we couldn't end up with non-valid swizzle here */
2108       chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2109                      format_desc->channel[format_desc->swizzle[0]].type :
2110                      UTIL_FORMAT_TYPE_FLOAT;
2111       if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2112          coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2113                                     bld->coord_bld.zero, bld->coord_bld.one);
2114       }
2115    }
2116 
2117    /*
2118     * Compute the level of detail (float).
2119     */
2120    if (min_filter != mag_filter ||
2121        mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2122       /* Need to compute lod either to choose mipmap levels or to
2123        * distinguish between minification/magnification with one mipmap level.
2124        */
2125       lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2126                             coords[0], coords[1], coords[2], cube_rho,
2127                             derivs, lod_bias, explicit_lod,
2128                             mip_filter, lod,
2129                             &lod_ipart, lod_fpart, lod_pos_or_zero);
2130       if (is_lodq) {
2131          LLVMValueRef last_level;
2132          last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2133                                                      bld->gallivm,
2134                                                      bld->context_ptr,
2135                                                      texture_index);
2136          first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2137                                                        bld->gallivm,
2138                                                        bld->context_ptr,
2139                                                        texture_index);
2140          last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2141          last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2142          last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2143 
2144          switch (mip_filter) {
2145          case PIPE_TEX_MIPFILTER_NONE:
2146             *lod_fpart = bld->lodf_bld.zero;
2147             break;
2148          case PIPE_TEX_MIPFILTER_NEAREST:
2149              *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2150              /* fallthrough */
2151          case PIPE_TEX_MIPFILTER_LINEAR:
2152             *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2153                                         bld->lodf_bld.zero, last_level);
2154             break;
2155          }
2156          return;
2157       }
2158 
2159    } else {
2160       lod_ipart = bld->lodi_bld.zero;
2161       *lod_pos_or_zero = bld->lodi_bld.zero;
2162    }
2163 
2164    if (bld->num_lods != bld->num_mips) {
2165       /* only makes sense if there's just a single mip level */
2166       assert(bld->num_mips == 1);
2167       lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2168    }
2169 
2170    /*
2171     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2172     */
2173    switch (mip_filter) {
2174    default:
2175       assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2176       /* fall-through */
2177    case PIPE_TEX_MIPFILTER_NONE:
2178       /* always use mip level 0 */
2179       first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2180                                                     bld->gallivm, bld->context_ptr,
2181                                                     texture_index);
2182       first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2183       *ilevel0 = first_level;
2184       break;
2185    case PIPE_TEX_MIPFILTER_NEAREST:
2186       assert(lod_ipart);
2187       lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2188       break;
2189    case PIPE_TEX_MIPFILTER_LINEAR:
2190       assert(lod_ipart);
2191       assert(*lod_fpart);
2192       lp_build_linear_mip_levels(bld, texture_index,
2193                                  lod_ipart, lod_fpart,
2194                                  ilevel0, ilevel1);
2195       break;
2196    }
2197 }
2198 
2199 static void
lp_build_clamp_border_color(struct lp_build_sample_context * bld,unsigned sampler_unit)2200 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2201                             unsigned sampler_unit)
2202 {
2203    struct gallivm_state *gallivm = bld->gallivm;
2204    LLVMBuilderRef builder = gallivm->builder;
2205    LLVMValueRef border_color_ptr =
2206       bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2207                                        bld->context_ptr, sampler_unit);
2208    LLVMValueRef border_color;
2209    const struct util_format_description *format_desc = bld->format_desc;
2210    struct lp_type vec4_type = bld->texel_type;
2211    struct lp_build_context vec4_bld;
2212    LLVMValueRef min_clamp = NULL;
2213    LLVMValueRef max_clamp = NULL;
2214 
2215    /*
2216     * For normalized format need to clamp border color (technically
2217     * probably should also quantize the data). Really sucks doing this
2218     * here but can't avoid at least for now since this is part of
2219     * sampler state and texture format is part of sampler_view state.
2220     * GL expects also expects clamping for uint/sint formats too so
2221     * do that as well (d3d10 can't end up here with uint/sint since it
2222     * only supports them with ld).
2223     */
2224    vec4_type.length = 4;
2225    lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2226 
2227    /*
2228     * Vectorized clamping of border color. Loading is a bit of a hack since
2229     * we just cast the pointer to float array to pointer to vec4
2230     * (int or float).
2231     */
2232    border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2233                                              lp_build_const_int32(gallivm, 0));
2234    border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2235                                        LLVMPointerType(vec4_bld.vec_type, 0), "");
2236    border_color = LLVMBuildLoad(builder, border_color_ptr, "");
2237    /* we don't have aligned type in the dynamic state unfortunately */
2238    LLVMSetAlignment(border_color, 4);
2239 
2240    /*
2241     * Instead of having some incredibly complex logic which will try to figure out
2242     * clamping necessary for each channel, simply use the first channel, and treat
2243     * mixed signed/unsigned normalized formats specially.
2244     * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
2245     * good reason.)
2246     */
2247    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2248       int chan;
2249       /* d/s needs special handling because both present means just sampling depth */
2250       if (util_format_is_depth_and_stencil(format_desc->format)) {
2251          chan = format_desc->swizzle[0];
2252       }
2253       else {
2254          chan = util_format_get_first_non_void_channel(format_desc->format);
2255       }
2256       if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2257          unsigned chan_type = format_desc->channel[chan].type;
2258          unsigned chan_norm = format_desc->channel[chan].normalized;
2259          unsigned chan_pure = format_desc->channel[chan].pure_integer;
2260          if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2261             if (chan_norm) {
2262                min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2263                max_clamp = vec4_bld.one;
2264             }
2265             else if (chan_pure) {
2266                /*
2267                 * Border color was stored as int, hence need min/max clamp
2268                 * only if chan has less than 32 bits..
2269                 */
2270                unsigned chan_size = format_desc->channel[chan].size;
2271                if (chan_size < 32) {
2272                   min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2273                                                      0 - (1 << (chan_size - 1)));
2274                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2275                                                      (1 << (chan_size - 1)) - 1);
2276                }
2277             }
2278             /* TODO: no idea about non-pure, non-normalized! */
2279          }
2280          else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2281             if (chan_norm) {
2282                min_clamp = vec4_bld.zero;
2283                max_clamp = vec4_bld.one;
2284             }
2285             /*
2286              * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
2287              * we use Z32_FLOAT_S8X24 to imply sampling depth component
2288              * and ignoring stencil, which will blow up here if we try to
2289              * do a uint clamp in a float texel build...
2290              * And even if we had that format, mesa st also thinks using z24s8
2291              * means depth sampling ignoring stencil.
2292              */
2293             else if (chan_pure) {
2294                /*
2295                 * Border color was stored as uint, hence never need min
2296                 * clamp, and only need max clamp if chan has less than 32 bits.
2297                 */
2298                unsigned chan_size = format_desc->channel[chan].size;
2299                if (chan_size < 32) {
2300                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2301                                                      (1 << chan_size) - 1);
2302                }
2303                /* TODO: no idea about non-pure, non-normalized! */
2304             }
2305          }
2306          else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2307             /* TODO: I have no idea what clamp this would need if any! */
2308          }
2309       }
2310       /* mixed plain formats (or different pure size) */
2311       switch (format_desc->format) {
2312       case PIPE_FORMAT_B10G10R10A2_UINT:
2313       case PIPE_FORMAT_R10G10B10A2_UINT:
2314       {
2315          unsigned max10 = (1 << 10) - 1;
2316          max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2317                                         max10, (1 << 2) - 1, NULL);
2318       }
2319          break;
2320       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2321          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2322                                         -1.0F, 0.0F, NULL);
2323          max_clamp = vec4_bld.one;
2324          break;
2325       case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2326       case PIPE_FORMAT_R5SG5SB6U_NORM:
2327          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2328                                         0.0F, 0.0F, NULL);
2329          max_clamp = vec4_bld.one;
2330          break;
2331       default:
2332          break;
2333       }
2334    }
2335    else {
2336       /* cannot figure this out from format description */
2337       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2338          /* s3tc formats are always unorm */
2339          min_clamp = vec4_bld.zero;
2340          max_clamp = vec4_bld.one;
2341       }
2342       else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2343                format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
2344          switch (format_desc->format) {
2345          case PIPE_FORMAT_RGTC1_UNORM:
2346          case PIPE_FORMAT_RGTC2_UNORM:
2347          case PIPE_FORMAT_LATC1_UNORM:
2348          case PIPE_FORMAT_LATC2_UNORM:
2349          case PIPE_FORMAT_ETC1_RGB8:
2350             min_clamp = vec4_bld.zero;
2351             max_clamp = vec4_bld.one;
2352             break;
2353          case PIPE_FORMAT_RGTC1_SNORM:
2354          case PIPE_FORMAT_RGTC2_SNORM:
2355          case PIPE_FORMAT_LATC1_SNORM:
2356          case PIPE_FORMAT_LATC2_SNORM:
2357             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2358             max_clamp = vec4_bld.one;
2359             break;
2360          default:
2361             assert(0);
2362             break;
2363          }
2364       }
2365       /*
2366        * all others from subsampled/other group, though we don't care
2367        * about yuv (and should not have any from zs here)
2368        */
2369       else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2370          switch (format_desc->format) {
2371          case PIPE_FORMAT_R8G8_B8G8_UNORM:
2372          case PIPE_FORMAT_G8R8_G8B8_UNORM:
2373          case PIPE_FORMAT_G8R8_B8R8_UNORM:
2374          case PIPE_FORMAT_R8G8_R8B8_UNORM:
2375          case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2376             min_clamp = vec4_bld.zero;
2377             max_clamp = vec4_bld.one;
2378             break;
2379          case PIPE_FORMAT_R8G8Bx_SNORM:
2380             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2381             max_clamp = vec4_bld.one;
2382             break;
2383             /*
2384              * Note smallfloat formats usually don't need clamping
2385              * (they still have infinite range) however this is not
2386              * true for r11g11b10 and r9g9b9e5, which can't represent
2387              * negative numbers (and additionally r9g9b9e5 can't represent
2388              * very large numbers). d3d10 seems happy without clamping in
2389              * this case, but gl spec is pretty clear: "for floating
2390              * point and integer formats, border values are clamped to
2391              * the representable range of the format" so do that here.
2392              */
2393          case PIPE_FORMAT_R11G11B10_FLOAT:
2394             min_clamp = vec4_bld.zero;
2395             break;
2396          case PIPE_FORMAT_R9G9B9E5_FLOAT:
2397             min_clamp = vec4_bld.zero;
2398             max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2399             break;
2400          default:
2401             assert(0);
2402             break;
2403          }
2404       }
2405    }
2406 
2407    if (min_clamp) {
2408       border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2409    }
2410    if (max_clamp) {
2411       border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2412    }
2413 
2414    bld->border_color_clamped = border_color;
2415 }
2416 
2417 
2418 /**
2419  * General texture sampling codegen.
2420  * This function handles texture sampling for all texture targets (1D,
2421  * 2D, 3D, cube) and all filtering modes.
2422  */
2423 static void
lp_build_sample_general(struct lp_build_sample_context * bld,unsigned sampler_unit,boolean is_gather,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef lod_positive,LLVMValueRef lod_fpart,LLVMValueRef ilevel0,LLVMValueRef ilevel1,LLVMValueRef * colors_out)2424 lp_build_sample_general(struct lp_build_sample_context *bld,
2425                         unsigned sampler_unit,
2426                         boolean is_gather,
2427                         const LLVMValueRef *coords,
2428                         const LLVMValueRef *offsets,
2429                         LLVMValueRef lod_positive,
2430                         LLVMValueRef lod_fpart,
2431                         LLVMValueRef ilevel0,
2432                         LLVMValueRef ilevel1,
2433                         LLVMValueRef *colors_out)
2434 {
2435    LLVMBuilderRef builder = bld->gallivm->builder;
2436    const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2437    const unsigned mip_filter = sampler_state->min_mip_filter;
2438    const unsigned min_filter = sampler_state->min_img_filter;
2439    const unsigned mag_filter = sampler_state->mag_img_filter;
2440    LLVMValueRef texels[4];
2441    unsigned chan;
2442 
2443    /* if we need border color, (potentially) clamp it now */
2444    if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2445                                               min_filter,
2446                                               mag_filter) ||
2447        (bld->dims > 1 &&
2448            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2449                                                   min_filter,
2450                                                   mag_filter)) ||
2451        (bld->dims > 2 &&
2452            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2453                                                   min_filter,
2454                                                   mag_filter))) {
2455       lp_build_clamp_border_color(bld, sampler_unit);
2456    }
2457 
2458 
2459    /*
2460     * Get/interpolate texture colors.
2461     */
2462 
2463    for (chan = 0; chan < 4; ++chan) {
2464      texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2465      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2466    }
2467 
2468    if (min_filter == mag_filter) {
2469       /* no need to distinguish between minification and magnification */
2470       lp_build_sample_mipmap(bld, min_filter, mip_filter,
2471                              is_gather,
2472                              coords, offsets,
2473                              ilevel0, ilevel1, lod_fpart,
2474                              texels);
2475    }
2476    else {
2477       /*
2478        * Could also get rid of the if-logic and always use mipmap_both, both
2479        * for the single lod and multi-lod case if nothing really uses this.
2480        */
2481       if (bld->num_lods == 1) {
2482          /* Emit conditional to choose min image filter or mag image filter
2483           * depending on the lod being > 0 or <= 0, respectively.
2484           */
2485          struct lp_build_if_state if_ctx;
2486 
2487          lod_positive = LLVMBuildTrunc(builder, lod_positive,
2488                                        LLVMInt1TypeInContext(bld->gallivm->context),
2489                                        "lod_pos");
2490 
2491          lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2492          {
2493             /* Use the minification filter */
2494             lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2495                                    coords, offsets,
2496                                    ilevel0, ilevel1, lod_fpart,
2497                                    texels);
2498          }
2499          lp_build_else(&if_ctx);
2500          {
2501             /* Use the magnification filter */
2502             lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2503                                    FALSE,
2504                                    coords, offsets,
2505                                    ilevel0, NULL, NULL,
2506                                    texels);
2507          }
2508          lp_build_endif(&if_ctx);
2509       }
2510       else {
2511          LLVMValueRef need_linear, linear_mask;
2512          unsigned mip_filter_for_nearest;
2513          struct lp_build_if_state if_ctx;
2514 
2515          if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2516             linear_mask = lod_positive;
2517             mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2518          }
2519          else {
2520             linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2521             mip_filter_for_nearest = mip_filter;
2522          }
2523          need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2524                                                linear_mask);
2525          lp_build_name(need_linear, "need_linear");
2526 
2527          if (bld->num_lods != bld->coord_type.length) {
2528             linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2529                                                                 bld->lodi_type,
2530                                                                 bld->int_coord_type,
2531                                                                 linear_mask);
2532          }
2533 
2534          lp_build_if(&if_ctx, bld->gallivm, need_linear);
2535          {
2536             /*
2537              * Do sampling with both filters simultaneously. This means using
2538              * a linear filter and doing some tricks (with weights) for the pixels
2539              * which need nearest filter.
2540              * Note that it's probably rare some pixels need nearest and some
2541              * linear filter but the fixups required for the nearest pixels
2542              * aren't all that complicated so just always run a combined path
2543              * if at least some pixels require linear.
2544              */
2545             lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2546                                         coords, offsets,
2547                                         ilevel0, ilevel1,
2548                                         lod_fpart, lod_positive,
2549                                         texels);
2550          }
2551          lp_build_else(&if_ctx);
2552          {
2553             /*
2554              * All pixels require just nearest filtering, which is way
2555              * cheaper than linear, hence do a separate path for that.
2556              */
2557             lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2558                                    mip_filter_for_nearest, FALSE,
2559                                    coords, offsets,
2560                                    ilevel0, ilevel1, lod_fpart,
2561                                    texels);
2562          }
2563          lp_build_endif(&if_ctx);
2564       }
2565    }
2566 
2567    for (chan = 0; chan < 4; ++chan) {
2568      colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2569      lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2570    }
2571 }
2572 
2573 
2574 /**
2575  * Texel fetch function.
2576  * In contrast to general sampling there is no filtering, no coord minification,
2577  * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
2578  * directly to be applied to the selected mip level (after adding texel offsets).
2579  * This function handles texel fetch for all targets where texel fetch is supported
2580  * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
2581  */
2582 static void
lp_build_fetch_texel(struct lp_build_sample_context * bld,unsigned texture_unit,const LLVMValueRef * coords,LLVMValueRef explicit_lod,const LLVMValueRef * offsets,LLVMValueRef * colors_out)2583 lp_build_fetch_texel(struct lp_build_sample_context *bld,
2584                      unsigned texture_unit,
2585                      const LLVMValueRef *coords,
2586                      LLVMValueRef explicit_lod,
2587                      const LLVMValueRef *offsets,
2588                      LLVMValueRef *colors_out)
2589 {
2590    struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2591    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2592    unsigned dims = bld->dims, chan;
2593    unsigned target = bld->static_texture_state->target;
2594    boolean out_of_bound_ret_zero = TRUE;
2595    LLVMValueRef size, ilevel;
2596    LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2597    LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2598    LLVMValueRef width, height, depth, i, j;
2599    LLVMValueRef offset, out_of_bounds, out1;
2600 
2601    out_of_bounds = int_coord_bld->zero;
2602 
2603    if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2604       if (bld->num_mips != int_coord_bld->type.length) {
2605          ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2606                                             perquadi_bld->type, explicit_lod, 0);
2607       }
2608       else {
2609          ilevel = explicit_lod;
2610       }
2611       lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2612                                  out_of_bound_ret_zero ? &out_of_bounds : NULL);
2613    }
2614    else {
2615       assert(bld->num_mips == 1);
2616       if (bld->static_texture_state->target != PIPE_BUFFER) {
2617          ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
2618                                                   bld->context_ptr, texture_unit);
2619       }
2620       else {
2621          ilevel = lp_build_const_int32(bld->gallivm, 0);
2622       }
2623    }
2624    lp_build_mipmap_level_sizes(bld, ilevel,
2625                                &size,
2626                                &row_stride_vec, &img_stride_vec);
2627    lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2628                                 size, &width, &height, &depth);
2629 
2630    if (target == PIPE_TEXTURE_1D_ARRAY ||
2631        target == PIPE_TEXTURE_2D_ARRAY) {
2632       if (out_of_bound_ret_zero) {
2633          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
2634          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2635       }
2636       else {
2637          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
2638       }
2639    }
2640 
2641    /* This is a lot like border sampling */
2642    if (offsets[0]) {
2643       /*
2644        * coords are really unsigned, offsets are signed, but I don't think
2645        * exceeding 31 bits is possible
2646        */
2647       x = lp_build_add(int_coord_bld, x, offsets[0]);
2648    }
2649    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2650    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2651    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2652    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2653 
2654    if (dims >= 2) {
2655       if (offsets[1]) {
2656          y = lp_build_add(int_coord_bld, y, offsets[1]);
2657       }
2658       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2659       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2660       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2661       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2662 
2663       if (dims >= 3) {
2664          if (offsets[2]) {
2665             z = lp_build_add(int_coord_bld, z, offsets[2]);
2666          }
2667          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2668          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2669          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2670          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2671       }
2672    }
2673 
2674    lp_build_sample_offset(int_coord_bld,
2675                           bld->format_desc,
2676                           x, y, z, row_stride_vec, img_stride_vec,
2677                           &offset, &i, &j);
2678 
2679    if (bld->static_texture_state->target != PIPE_BUFFER) {
2680       offset = lp_build_add(int_coord_bld, offset,
2681                             lp_build_get_mip_offsets(bld, ilevel));
2682    }
2683 
2684    offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2685 
2686    lp_build_fetch_rgba_soa(bld->gallivm,
2687                            bld->format_desc,
2688                            bld->texel_type, TRUE,
2689                            bld->base_ptr, offset,
2690                            i, j,
2691                            bld->cache,
2692                            colors_out);
2693 
2694    if (out_of_bound_ret_zero) {
2695       /*
2696        * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2697        * Could use min/max above instead of out-of-bounds comparisons
2698        * if we don't care about the result returned for out-of-bounds.
2699        */
2700       for (chan = 0; chan < 4; chan++) {
2701          colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2702                                             bld->texel_bld.zero, colors_out[chan]);
2703       }
2704    }
2705 }
2706 
2707 
2708 /**
2709  * Just set texels to white instead of actually sampling the texture.
2710  * For debugging.
2711  */
2712 void
lp_build_sample_nop(struct gallivm_state * gallivm,struct lp_type type,const LLVMValueRef * coords,LLVMValueRef texel_out[4])2713 lp_build_sample_nop(struct gallivm_state *gallivm,
2714                     struct lp_type type,
2715                     const LLVMValueRef *coords,
2716                     LLVMValueRef texel_out[4])
2717 {
2718    LLVMValueRef one = lp_build_one(gallivm, type);
2719    unsigned chan;
2720 
2721    for (chan = 0; chan < 4; chan++) {
2722       texel_out[chan] = one;
2723    }
2724 }
2725 
2726 
2727 /**
2728  * Build the actual texture sampling code.
2729  * 'texel' will return a vector of four LLVMValueRefs corresponding to
2730  * R, G, B, A.
2731  * \param type  vector float type to use for coords, etc.
2732  * \param sample_key
2733  * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
2734  */
2735 static void
lp_build_sample_soa_code(struct gallivm_state * gallivm,const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,struct lp_type type,unsigned sample_key,unsigned texture_index,unsigned sampler_index,LLVMValueRef context_ptr,LLVMValueRef thread_data_ptr,const LLVMValueRef * coords,const LLVMValueRef * offsets,const struct lp_derivatives * derivs,LLVMValueRef lod,LLVMValueRef texel_out[4])2736 lp_build_sample_soa_code(struct gallivm_state *gallivm,
2737                          const struct lp_static_texture_state *static_texture_state,
2738                          const struct lp_static_sampler_state *static_sampler_state,
2739                          struct lp_sampler_dynamic_state *dynamic_state,
2740                          struct lp_type type,
2741                          unsigned sample_key,
2742                          unsigned texture_index,
2743                          unsigned sampler_index,
2744                          LLVMValueRef context_ptr,
2745                          LLVMValueRef thread_data_ptr,
2746                          const LLVMValueRef *coords,
2747                          const LLVMValueRef *offsets,
2748                          const struct lp_derivatives *derivs, /* optional */
2749                          LLVMValueRef lod, /* optional */
2750                          LLVMValueRef texel_out[4])
2751 {
2752    unsigned target = static_texture_state->target;
2753    unsigned dims = texture_dims(target);
2754    unsigned num_quads = type.length / 4;
2755    unsigned mip_filter, min_img_filter, mag_img_filter, i;
2756    struct lp_build_sample_context bld;
2757    struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
2758    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2759    LLVMBuilderRef builder = gallivm->builder;
2760    LLVMValueRef tex_width, newcoords[5];
2761    enum lp_sampler_lod_property lod_property;
2762    enum lp_sampler_lod_control lod_control;
2763    enum lp_sampler_op_type op_type;
2764    LLVMValueRef lod_bias = NULL;
2765    LLVMValueRef explicit_lod = NULL;
2766    boolean op_is_tex, op_is_lodq, op_is_gather;
2767 
2768    if (0) {
2769       enum pipe_format fmt = static_texture_state->format;
2770       debug_printf("Sample from %s\n", util_format_name(fmt));
2771    }
2772 
2773    lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
2774                      LP_SAMPLER_LOD_PROPERTY_SHIFT;
2775    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
2776                     LP_SAMPLER_LOD_CONTROL_SHIFT;
2777    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
2778                  LP_SAMPLER_OP_TYPE_SHIFT;
2779 
2780    op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
2781    op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
2782    op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
2783 
2784    if (lod_control == LP_SAMPLER_LOD_BIAS) {
2785       lod_bias = lod;
2786       assert(lod);
2787       assert(derivs == NULL);
2788    }
2789    else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
2790       explicit_lod = lod;
2791       assert(lod);
2792       assert(derivs == NULL);
2793    }
2794    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
2795       assert(derivs);
2796       assert(lod == NULL);
2797    }
2798    else {
2799       assert(derivs == NULL);
2800       assert(lod == NULL);
2801    }
2802 
2803    if (static_texture_state->format == PIPE_FORMAT_NONE) {
2804       /*
2805        * If there's nothing bound, format is NONE, and we must return
2806        * all zero as mandated by d3d10 in this case.
2807        */
2808       unsigned chan;
2809       LLVMValueRef zero = lp_build_zero(gallivm, type);
2810       for (chan = 0; chan < 4; chan++) {
2811          texel_out[chan] = zero;
2812       }
2813       return;
2814    }
2815 
2816    assert(type.floating);
2817 
2818    /* Setup our build context */
2819    memset(&bld, 0, sizeof bld);
2820    bld.gallivm = gallivm;
2821    bld.context_ptr = context_ptr;
2822    bld.static_sampler_state = &derived_sampler_state;
2823    bld.static_texture_state = static_texture_state;
2824    bld.dynamic_state = dynamic_state;
2825    bld.format_desc = util_format_description(static_texture_state->format);
2826    bld.dims = dims;
2827 
2828    if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD || op_is_lodq) {
2829       bld.no_quad_lod = TRUE;
2830    }
2831    if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX || op_is_lodq) {
2832       bld.no_rho_approx = TRUE;
2833    }
2834    if (gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR || op_is_lodq) {
2835       bld.no_brilinear = TRUE;
2836    }
2837 
2838    bld.vector_width = lp_type_width(type);
2839 
2840    bld.float_type = lp_type_float(32);
2841    bld.int_type = lp_type_int(32);
2842    bld.coord_type = type;
2843    bld.int_coord_type = lp_int_type(type);
2844    bld.float_size_in_type = lp_type_float(32);
2845    bld.float_size_in_type.length = dims > 1 ? 4 : 1;
2846    bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
2847    bld.texel_type = type;
2848 
2849    /* always using the first channel hopefully should be safe,
2850     * if not things WILL break in other places anyway.
2851     */
2852    if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
2853        bld.format_desc->channel[0].pure_integer) {
2854       if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
2855          bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2856       }
2857       else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2858          bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2859       }
2860    }
2861    else if (util_format_has_stencil(bld.format_desc) &&
2862        !util_format_has_depth(bld.format_desc)) {
2863       /* for stencil only formats, sample stencil (uint) */
2864       bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2865    }
2866 
2867    if (!static_texture_state->level_zero_only ||
2868        !static_sampler_state->max_lod_pos || op_is_lodq) {
2869       derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
2870    } else {
2871       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2872    }
2873    if (op_is_gather) {
2874       /*
2875        * gather4 is exactly like GL_LINEAR filtering but in the end skipping
2876        * the actual filtering. Using mostly the same paths, so cube face
2877        * selection, coord wrapping etc. all naturally uses the same code.
2878        */
2879       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2880       derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
2881       derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
2882    }
2883    mip_filter = derived_sampler_state.min_mip_filter;
2884 
2885    if (0) {
2886       debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
2887    }
2888 
2889    if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2890        static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2891    {
2892       /*
2893        * Seamless filtering ignores wrap modes.
2894        * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
2895        * bilinear it's not correct but way better than using for instance repeat.
2896        * Note we even set this for non-seamless. Technically GL allows any wrap
2897        * mode, which made sense when supporting true borders (can get seamless
2898        * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
2899        * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
2900        * up the sampler state (as it makes it texture dependent).
2901        */
2902       derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2903       derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2904    }
2905    /*
2906     * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
2907     * so AoS path could be used. Not sure it's worth the trouble...
2908     */
2909 
2910    min_img_filter = derived_sampler_state.min_img_filter;
2911    mag_img_filter = derived_sampler_state.mag_img_filter;
2912 
2913 
2914    /*
2915     * This is all a bit complicated different paths are chosen for performance
2916     * reasons.
2917     * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
2918     * everything (the last two options are equivalent for 4-wide case).
2919     * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
2920     * lod is calculated then the lod value extracted afterwards so making this
2921     * case basically the same as far as lod handling is concerned for the
2922     * further sample/filter code as the 1 lod for everything case.
2923     * Different lod handling mostly shows up when building mipmap sizes
2924     * (lp_build_mipmap_level_sizes() and friends) and also in filtering
2925     * (getting the fractional part of the lod to the right texels).
2926     */
2927 
2928    /*
2929     * There are other situations where at least the multiple int lods could be
2930     * avoided like min and max lod being equal.
2931     */
2932    bld.num_mips = bld.num_lods = 1;
2933 
2934    if (bld.no_quad_lod && bld.no_rho_approx &&
2935        ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
2936          (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2937           static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
2938         op_is_lodq)) {
2939       /*
2940        * special case for using per-pixel lod even for implicit lod,
2941        * which is generally never required (ok by APIs) except to please
2942        * some (somewhat broken imho) tests (because per-pixel face selection
2943        * can cause derivatives to be different for pixels outside the primitive
2944        * due to the major axis division even if pre-project derivatives are
2945        * looking normal).
2946        * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
2947        * cube maps we do indeed get per-pixel lod values).
2948        */
2949       bld.num_mips = type.length;
2950       bld.num_lods = type.length;
2951    }
2952    else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
2953        (explicit_lod || lod_bias || derivs)) {
2954       if ((!op_is_tex && target != PIPE_BUFFER) ||
2955           (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2956          bld.num_mips = type.length;
2957          bld.num_lods = type.length;
2958       }
2959       else if (op_is_tex && min_img_filter != mag_img_filter) {
2960          bld.num_mips = 1;
2961          bld.num_lods = type.length;
2962       }
2963    }
2964    /* TODO: for true scalar_lod should only use 1 lod value */
2965    else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
2966             (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2967       bld.num_mips = num_quads;
2968       bld.num_lods = num_quads;
2969    }
2970    else if (op_is_tex && min_img_filter != mag_img_filter) {
2971       bld.num_mips = 1;
2972       bld.num_lods = num_quads;
2973    }
2974 
2975 
2976    bld.lodf_type = type;
2977    /* we want native vector size to be able to use our intrinsics */
2978    if (bld.num_lods != type.length) {
2979       /* TODO: this currently always has to be per-quad or per-element */
2980       bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
2981    }
2982    bld.lodi_type = lp_int_type(bld.lodf_type);
2983    bld.levelf_type = bld.lodf_type;
2984    if (bld.num_mips == 1) {
2985       bld.levelf_type.length = 1;
2986    }
2987    bld.leveli_type = lp_int_type(bld.levelf_type);
2988    bld.float_size_type = bld.float_size_in_type;
2989    /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
2990     * with per-element lod that is w0/h0/d0/_/w1/h1/d1_/... so up to 8x4f32 */
2991    if (bld.num_mips > 1) {
2992       bld.float_size_type.length = bld.num_mips == type.length ?
2993                                       bld.num_mips * bld.float_size_in_type.length :
2994                                       type.length;
2995    }
2996    bld.int_size_type = lp_int_type(bld.float_size_type);
2997 
2998    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
2999    lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3000    lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3001    lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3002    lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3003    lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3004    lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3005    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3006    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3007    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3008    lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3009    lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3010    lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3011    lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3012 
3013    /* Get the dynamic state */
3014    tex_width = dynamic_state->width(dynamic_state, gallivm,
3015                                     context_ptr, texture_index);
3016    bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3017                                                     context_ptr, texture_index);
3018    bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3019                                                     context_ptr, texture_index);
3020    bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3021                                           context_ptr, texture_index);
3022    bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3023                                                 context_ptr, texture_index);
3024    /* Note that mip_offsets is an array[level] of offsets to texture images */
3025 
3026    if (dynamic_state->cache_ptr && thread_data_ptr) {
3027       bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3028                                            thread_data_ptr, texture_index);
3029    }
3030 
3031    /* width, height, depth as single int vector */
3032    if (dims <= 1) {
3033       bld.int_size = tex_width;
3034    }
3035    else {
3036       bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3037                                             tex_width,
3038                                             LLVMConstInt(i32t, 0, 0), "");
3039       if (dims >= 2) {
3040          LLVMValueRef tex_height =
3041             dynamic_state->height(dynamic_state, gallivm,
3042                                   context_ptr, texture_index);
3043          bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3044                                                tex_height,
3045                                                LLVMConstInt(i32t, 1, 0), "");
3046          if (dims >= 3) {
3047             LLVMValueRef tex_depth =
3048                dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3049                                     texture_index);
3050             bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3051                                                   tex_depth,
3052                                                   LLVMConstInt(i32t, 2, 0), "");
3053          }
3054       }
3055    }
3056 
3057    for (i = 0; i < 5; i++) {
3058       newcoords[i] = coords[i];
3059    }
3060 
3061    if (util_format_is_pure_integer(static_texture_state->format) &&
3062        !util_format_has_depth(bld.format_desc) && op_is_tex &&
3063        (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3064         static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3065         static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3066       /*
3067        * Bail if impossible filtering is specified (the awkard additional
3068        * depth check is because it is legal in gallium to have things like S8Z24
3069        * here which would say it's pure int despite such formats should sample
3070        * the depth component).
3071        * In GL such filters make the texture incomplete, this makes it robust
3072        * against state trackers which set this up regardless (we'd crash in the
3073        * lerp later otherwise).
3074        * At least in some apis it may be legal to use such filters with lod
3075        * queries and/or gather (at least for gather d3d10 says only the wrap
3076        * bits are really used hence filter bits are likely simply ignored).
3077        * For fetch, we don't get valid samplers either way here.
3078        */
3079       unsigned chan;
3080       LLVMValueRef zero = lp_build_zero(gallivm, type);
3081       for (chan = 0; chan < 4; chan++) {
3082          texel_out[chan] = zero;
3083       }
3084       return;
3085    }
3086 
3087    if (0) {
3088       /* For debug: no-op texture sampling */
3089       lp_build_sample_nop(gallivm,
3090                           bld.texel_type,
3091                           newcoords,
3092                           texel_out);
3093    }
3094 
3095    else if (op_type == LP_SAMPLER_OP_FETCH) {
3096       lp_build_fetch_texel(&bld, texture_index, newcoords,
3097                            lod, offsets,
3098                            texel_out);
3099    }
3100 
3101    else {
3102       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3103       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3104       boolean use_aos;
3105 
3106       use_aos = util_format_fits_8unorm(bld.format_desc) &&
3107                 op_is_tex &&
3108                 /* not sure this is strictly needed or simply impossible */
3109                 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3110                 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3111 
3112       use_aos &= bld.num_lods <= num_quads ||
3113                  derived_sampler_state.min_img_filter ==
3114                     derived_sampler_state.mag_img_filter;
3115       if (dims > 1) {
3116          use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3117          if (dims > 2) {
3118             use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3119          }
3120       }
3121       if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3122            static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3123           derived_sampler_state.seamless_cube_map &&
3124           (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3125            derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3126          /* theoretically possible with AoS filtering but not implemented (complex!) */
3127          use_aos = 0;
3128       }
3129 
3130       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3131           !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3132          debug_printf("%s: using floating point linear filtering for %s\n",
3133                       __FUNCTION__, bld.format_desc->short_name);
3134          debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
3135                       "  wraps %d  wrapt %d  wrapr %d\n",
3136                       derived_sampler_state.min_img_filter,
3137                       derived_sampler_state.mag_img_filter,
3138                       derived_sampler_state.min_mip_filter,
3139                       static_texture_state->target,
3140                       derived_sampler_state.seamless_cube_map,
3141                       derived_sampler_state.wrap_s,
3142                       derived_sampler_state.wrap_t,
3143                       derived_sampler_state.wrap_r);
3144       }
3145 
3146       lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3147                              newcoords,
3148                              derivs, lod_bias, explicit_lod,
3149                              &lod_positive, &lod, &lod_fpart,
3150                              &ilevel0, &ilevel1);
3151 
3152       if (op_is_lodq) {
3153          texel_out[0] = lod_fpart;
3154          texel_out[1] = lod;
3155          texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3156          return;
3157       }
3158 
3159       if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3160          /* The aos path doesn't do seamless filtering so simply add cube layer
3161           * to face now.
3162           */
3163          newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3164       }
3165 
3166       /*
3167        * we only try 8-wide sampling with soa or if we have AVX2
3168        * as it appears to be a loss with just AVX)
3169        */
3170       if (num_quads == 1 || !use_aos ||
3171           (util_cpu_caps.has_avx2 &&
3172            (bld.num_lods == 1 ||
3173             derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3174          if (use_aos) {
3175             /* do sampling/filtering with fixed pt arithmetic */
3176             lp_build_sample_aos(&bld, sampler_index,
3177                                 newcoords[0], newcoords[1],
3178                                 newcoords[2],
3179                                 offsets, lod_positive, lod_fpart,
3180                                 ilevel0, ilevel1,
3181                                 texel_out);
3182          }
3183 
3184          else {
3185             lp_build_sample_general(&bld, sampler_index,
3186                                     op_type == LP_SAMPLER_OP_GATHER,
3187                                     newcoords, offsets,
3188                                     lod_positive, lod_fpart,
3189                                     ilevel0, ilevel1,
3190                                     texel_out);
3191          }
3192       }
3193       else {
3194          unsigned j;
3195          struct lp_build_sample_context bld4;
3196          struct lp_type type4 = type;
3197          unsigned i;
3198          LLVMValueRef texelout4[4];
3199          LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3200 
3201          type4.length = 4;
3202 
3203          /* Setup our build context */
3204          memset(&bld4, 0, sizeof bld4);
3205          bld4.no_quad_lod = bld.no_quad_lod;
3206          bld4.no_rho_approx = bld.no_rho_approx;
3207          bld4.no_brilinear = bld.no_brilinear;
3208          bld4.gallivm = bld.gallivm;
3209          bld4.context_ptr = bld.context_ptr;
3210          bld4.static_texture_state = bld.static_texture_state;
3211          bld4.static_sampler_state = bld.static_sampler_state;
3212          bld4.dynamic_state = bld.dynamic_state;
3213          bld4.format_desc = bld.format_desc;
3214          bld4.dims = bld.dims;
3215          bld4.row_stride_array = bld.row_stride_array;
3216          bld4.img_stride_array = bld.img_stride_array;
3217          bld4.base_ptr = bld.base_ptr;
3218          bld4.mip_offsets = bld.mip_offsets;
3219          bld4.int_size = bld.int_size;
3220          bld4.cache = bld.cache;
3221 
3222          bld4.vector_width = lp_type_width(type4);
3223 
3224          bld4.float_type = lp_type_float(32);
3225          bld4.int_type = lp_type_int(32);
3226          bld4.coord_type = type4;
3227          bld4.int_coord_type = lp_int_type(type4);
3228          bld4.float_size_in_type = lp_type_float(32);
3229          bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3230          bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3231          bld4.texel_type = bld.texel_type;
3232          bld4.texel_type.length = 4;
3233 
3234          bld4.num_mips = bld4.num_lods = 1;
3235          if (bld4.no_quad_lod && bld4.no_rho_approx &&
3236              (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3237               static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3238              (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3239             bld4.num_mips = type4.length;
3240             bld4.num_lods = type4.length;
3241          }
3242          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3243              (explicit_lod || lod_bias || derivs)) {
3244             if ((!op_is_tex && target != PIPE_BUFFER) ||
3245                 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3246                bld4.num_mips = type4.length;
3247                bld4.num_lods = type4.length;
3248             }
3249             else if (op_is_tex && min_img_filter != mag_img_filter) {
3250                bld4.num_mips = 1;
3251                bld4.num_lods = type4.length;
3252             }
3253          }
3254 
3255          /* we want native vector size to be able to use our intrinsics */
3256          bld4.lodf_type = type4;
3257          if (bld4.num_lods != type4.length) {
3258             bld4.lodf_type.length = 1;
3259          }
3260          bld4.lodi_type = lp_int_type(bld4.lodf_type);
3261          bld4.levelf_type = type4;
3262          if (bld4.num_mips != type4.length) {
3263             bld4.levelf_type.length = 1;
3264          }
3265          bld4.leveli_type = lp_int_type(bld4.levelf_type);
3266          bld4.float_size_type = bld4.float_size_in_type;
3267          if (bld4.num_mips > 1) {
3268             bld4.float_size_type.length = bld4.num_mips == type4.length ?
3269                                             bld4.num_mips * bld4.float_size_in_type.length :
3270                                             type4.length;
3271          }
3272          bld4.int_size_type = lp_int_type(bld4.float_size_type);
3273 
3274          lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3275          lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3276          lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3277          lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3278          lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3279          lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3280          lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3281          lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3282          lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3283          lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3284          lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3285          lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3286          lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3287          lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3288 
3289          for (i = 0; i < num_quads; i++) {
3290             LLVMValueRef s4, t4, r4;
3291             LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3292             LLVMValueRef ilevel04, ilevel14 = NULL;
3293             LLVMValueRef offsets4[4] = { NULL };
3294             unsigned num_lods = bld4.num_lods;
3295 
3296             s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3297             t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3298             r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3299 
3300             if (offsets[0]) {
3301                offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3302                if (dims > 1) {
3303                   offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3304                   if (dims > 2) {
3305                      offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3306                   }
3307                }
3308             }
3309             lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3310             ilevel04 = bld.num_mips == 1 ? ilevel0 :
3311                           lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3312             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3313                ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3314                lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3315             }
3316 
3317             if (use_aos) {
3318                /* do sampling/filtering with fixed pt arithmetic */
3319                lp_build_sample_aos(&bld4, sampler_index,
3320                                    s4, t4, r4, offsets4,
3321                                    lod_positive4, lod_fpart4,
3322                                    ilevel04, ilevel14,
3323                                    texelout4);
3324             }
3325 
3326             else {
3327                /* this path is currently unreachable and hence might break easily... */
3328                LLVMValueRef newcoords4[5];
3329                newcoords4[0] = s4;
3330                newcoords4[1] = t4;
3331                newcoords4[2] = r4;
3332                newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3333                newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3334 
3335                lp_build_sample_general(&bld4, sampler_index,
3336                                        op_type == LP_SAMPLER_OP_GATHER,
3337                                        newcoords4, offsets4,
3338                                        lod_positive4, lod_fpart4,
3339                                        ilevel04, ilevel14,
3340                                        texelout4);
3341             }
3342             for (j = 0; j < 4; j++) {
3343                texelouttmp[j][i] = texelout4[j];
3344             }
3345          }
3346 
3347          for (j = 0; j < 4; j++) {
3348             texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3349          }
3350       }
3351    }
3352 
3353    if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3354       apply_sampler_swizzle(&bld, texel_out);
3355    }
3356 
3357    /*
3358     * texel type can be a (32bit) int/uint (for pure int formats only),
3359     * however we are expected to always return floats (storage is untyped).
3360     */
3361    if (!bld.texel_type.floating) {
3362       unsigned chan;
3363       for (chan = 0; chan < 4; chan++) {
3364          texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3365                                             lp_build_vec_type(gallivm, type), "");
3366       }
3367    }
3368 }
3369 
3370 
/*
 * Compile-time switch tested by lp_build_sample_soa(): when non-zero,
 * sampling that is not considered "simple enough" is emitted as a call to
 * a separately generated texture function instead of being inlined.
 */
#define USE_TEX_FUNC_CALL 1

/* Capacity of the argument / argument-type arrays used for texture functions. */
#define LP_MAX_TEX_FUNC_ARGS 32
3374 
3375 static inline void
get_target_info(enum pipe_texture_target target,unsigned * num_coords,unsigned * num_derivs,unsigned * num_offsets,unsigned * layer)3376 get_target_info(enum pipe_texture_target target,
3377                 unsigned *num_coords, unsigned *num_derivs,
3378                 unsigned *num_offsets, unsigned *layer)
3379 {
3380    unsigned dims = texture_dims(target);
3381    *num_coords = dims;
3382    *num_offsets = dims;
3383    *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3384                   target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3385    *layer = has_layer_coord(target) ? 2: 0;
3386    if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3387       /*
3388        * dims doesn't include r coord for cubes - this is handled
3389        * by layer instead, but need to fix up for cube arrays...
3390        */
3391       *layer = 3;
3392       *num_coords = 3;
3393    }
3394 }
3395 
3396 
3397 /**
3398  * Generate the function body for a texture sampling function.
3399  */
3400 static void
lp_build_sample_gen_func(struct gallivm_state * gallivm,const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,struct lp_type type,unsigned texture_index,unsigned sampler_index,LLVMValueRef function,unsigned num_args,unsigned sample_key)3401 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3402                          const struct lp_static_texture_state *static_texture_state,
3403                          const struct lp_static_sampler_state *static_sampler_state,
3404                          struct lp_sampler_dynamic_state *dynamic_state,
3405                          struct lp_type type,
3406                          unsigned texture_index,
3407                          unsigned sampler_index,
3408                          LLVMValueRef function,
3409                          unsigned num_args,
3410                          unsigned sample_key)
3411 {
3412    LLVMBuilderRef old_builder;
3413    LLVMBasicBlockRef block;
3414    LLVMValueRef coords[5];
3415    LLVMValueRef offsets[3] = { NULL };
3416    LLVMValueRef lod = NULL;
3417    LLVMValueRef context_ptr;
3418    LLVMValueRef thread_data_ptr = NULL;
3419    LLVMValueRef texel_out[4];
3420    struct lp_derivatives derivs;
3421    struct lp_derivatives *deriv_ptr = NULL;
3422    unsigned num_param = 0;
3423    unsigned i, num_coords, num_derivs, num_offsets, layer;
3424    enum lp_sampler_lod_control lod_control;
3425    boolean need_cache = FALSE;
3426 
3427    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3428                     LP_SAMPLER_LOD_CONTROL_SHIFT;
3429 
3430    get_target_info(static_texture_state->target,
3431                    &num_coords, &num_derivs, &num_offsets, &layer);
3432 
3433    if (dynamic_state->cache_ptr) {
3434       const struct util_format_description *format_desc;
3435       format_desc = util_format_description(static_texture_state->format);
3436       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3437          need_cache = TRUE;
3438       }
3439    }
3440 
3441    /* "unpack" arguments */
3442    context_ptr = LLVMGetParam(function, num_param++);
3443    if (need_cache) {
3444       thread_data_ptr = LLVMGetParam(function, num_param++);
3445    }
3446    for (i = 0; i < num_coords; i++) {
3447       coords[i] = LLVMGetParam(function, num_param++);
3448    }
3449    for (i = num_coords; i < 5; i++) {
3450       /* This is rather unfortunate... */
3451       coords[i] = lp_build_undef(gallivm, type);
3452    }
3453    if (layer) {
3454       coords[layer] = LLVMGetParam(function, num_param++);
3455    }
3456    if (sample_key & LP_SAMPLER_SHADOW) {
3457       coords[4] = LLVMGetParam(function, num_param++);
3458    }
3459    if (sample_key & LP_SAMPLER_OFFSETS) {
3460       for (i = 0; i < num_offsets; i++) {
3461          offsets[i] = LLVMGetParam(function, num_param++);
3462       }
3463    }
3464    if (lod_control == LP_SAMPLER_LOD_BIAS ||
3465        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3466       lod = LLVMGetParam(function, num_param++);
3467    }
3468    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3469       for (i = 0; i < num_derivs; i++) {
3470          derivs.ddx[i] = LLVMGetParam(function, num_param++);
3471          derivs.ddy[i] = LLVMGetParam(function, num_param++);
3472       }
3473       deriv_ptr = &derivs;
3474    }
3475 
3476    assert(num_args == num_param);
3477 
3478    /*
3479     * Function body
3480     */
3481 
3482    old_builder = gallivm->builder;
3483    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3484    gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
3485    LLVMPositionBuilderAtEnd(gallivm->builder, block);
3486 
3487    lp_build_sample_soa_code(gallivm,
3488                             static_texture_state,
3489                             static_sampler_state,
3490                             dynamic_state,
3491                             type,
3492                             sample_key,
3493                             texture_index,
3494                             sampler_index,
3495                             context_ptr,
3496                             thread_data_ptr,
3497                             coords,
3498                             offsets,
3499                             deriv_ptr,
3500                             lod,
3501                             texel_out);
3502 
3503    LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
3504 
3505    LLVMDisposeBuilder(gallivm->builder);
3506    gallivm->builder = old_builder;
3507 
3508    gallivm_verify_function(gallivm, function);
3509 }
3510 
3511 
3512 /**
3513  * Call the matching function for texture sampling.
3514  * If there's no match, generate a new one.
3515  */
3516 static void
lp_build_sample_soa_func(struct gallivm_state * gallivm,const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,const struct lp_sampler_params * params)3517 lp_build_sample_soa_func(struct gallivm_state *gallivm,
3518                          const struct lp_static_texture_state *static_texture_state,
3519                          const struct lp_static_sampler_state *static_sampler_state,
3520                          struct lp_sampler_dynamic_state *dynamic_state,
3521                          const struct lp_sampler_params *params)
3522 {
3523    LLVMBuilderRef builder = gallivm->builder;
3524    LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3525                              LLVMGetInsertBlock(builder)));
3526    LLVMValueRef function, inst;
3527    LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3528    LLVMBasicBlockRef bb;
3529    LLVMValueRef tex_ret;
3530    unsigned num_args = 0;
3531    char func_name[64];
3532    unsigned i, num_coords, num_derivs, num_offsets, layer;
3533    unsigned texture_index = params->texture_index;
3534    unsigned sampler_index = params->sampler_index;
3535    unsigned sample_key = params->sample_key;
3536    const LLVMValueRef *coords = params->coords;
3537    const LLVMValueRef *offsets = params->offsets;
3538    const struct lp_derivatives *derivs = params->derivs;
3539    enum lp_sampler_lod_control lod_control;
3540    boolean need_cache = FALSE;
3541 
3542    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3543                     LP_SAMPLER_LOD_CONTROL_SHIFT;
3544 
3545    get_target_info(static_texture_state->target,
3546                    &num_coords, &num_derivs, &num_offsets, &layer);
3547 
3548    if (dynamic_state->cache_ptr) {
3549       const struct util_format_description *format_desc;
3550       format_desc = util_format_description(static_texture_state->format);
3551       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3552          /*
3553           * This is not 100% correct, if we have cache but the
3554           * util_format_s3tc_prefer is true the cache won't get used
3555           * regardless (could hook up the block decode there...) */
3556          need_cache = TRUE;
3557       }
3558    }
3559    /*
3560     * texture function matches are found by name.
3561     * Thus the name has to include both the texture and sampler unit
3562     * (which covers all static state) plus the actual texture function
3563     * (including things like offsets, shadow coord, lod control).
3564     * Additionally lod_property has to be included too.
3565     */
3566 
3567    util_snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3568                  texture_index, sampler_index, sample_key);
3569 
3570    function = LLVMGetNamedFunction(module, func_name);
3571 
3572    if(!function) {
3573       LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3574       LLVMTypeRef ret_type;
3575       LLVMTypeRef function_type;
3576       LLVMTypeRef val_type[4];
3577       unsigned num_param = 0;
3578 
3579       /*
3580        * Generate the function prototype.
3581        */
3582 
3583       arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3584       if (need_cache) {
3585          arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3586       }
3587       for (i = 0; i < num_coords; i++) {
3588          arg_types[num_param++] = LLVMTypeOf(coords[0]);
3589          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3590       }
3591       if (layer) {
3592          arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3593          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3594       }
3595       if (sample_key & LP_SAMPLER_SHADOW) {
3596          arg_types[num_param++] = LLVMTypeOf(coords[0]);
3597       }
3598       if (sample_key & LP_SAMPLER_OFFSETS) {
3599          for (i = 0; i < num_offsets; i++) {
3600             arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3601             assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3602          }
3603       }
3604       if (lod_control == LP_SAMPLER_LOD_BIAS ||
3605           lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3606          arg_types[num_param++] = LLVMTypeOf(params->lod);
3607       }
3608       else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3609          for (i = 0; i < num_derivs; i++) {
3610             arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3611             arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3612             assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3613             assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3614          }
3615       }
3616 
3617       val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3618          lp_build_vec_type(gallivm, params->type);
3619       ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
3620       function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3621       function = LLVMAddFunction(module, func_name, function_type);
3622 
3623       for (i = 0; i < num_param; ++i) {
3624          if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
3625 
3626             lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3627          }
3628       }
3629 
3630       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3631       LLVMSetLinkage(function, LLVMInternalLinkage);
3632 
3633       lp_build_sample_gen_func(gallivm,
3634                                static_texture_state,
3635                                static_sampler_state,
3636                                dynamic_state,
3637                                params->type,
3638                                texture_index,
3639                                sampler_index,
3640                                function,
3641                                num_param,
3642                                sample_key);
3643    }
3644 
3645    num_args = 0;
3646    args[num_args++] = params->context_ptr;
3647    if (need_cache) {
3648       args[num_args++] = params->thread_data_ptr;
3649    }
3650    for (i = 0; i < num_coords; i++) {
3651       args[num_args++] = coords[i];
3652    }
3653    if (layer) {
3654       args[num_args++] = coords[layer];
3655    }
3656    if (sample_key & LP_SAMPLER_SHADOW) {
3657       args[num_args++] = coords[4];
3658    }
3659    if (sample_key & LP_SAMPLER_OFFSETS) {
3660       for (i = 0; i < num_offsets; i++) {
3661          args[num_args++] = offsets[i];
3662       }
3663    }
3664    if (lod_control == LP_SAMPLER_LOD_BIAS ||
3665        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3666       args[num_args++] = params->lod;
3667    }
3668    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3669       for (i = 0; i < num_derivs; i++) {
3670          args[num_args++] = derivs->ddx[i];
3671          args[num_args++] = derivs->ddy[i];
3672       }
3673    }
3674 
3675    assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3676 
3677    tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3678    bb = LLVMGetInsertBlock(builder);
3679    inst = LLVMGetLastInstruction(bb);
3680    LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
3681 
3682    for (i = 0; i < 4; i++) {
3683       params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3684    }
3685 }
3686 
3687 
3688 /**
3689  * Build texture sampling code.
3690  * Either via a function call or inline it directly.
3691  */
3692 void
lp_build_sample_soa(const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,struct gallivm_state * gallivm,const struct lp_sampler_params * params)3693 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3694                     const struct lp_static_sampler_state *static_sampler_state,
3695                     struct lp_sampler_dynamic_state *dynamic_state,
3696                     struct gallivm_state *gallivm,
3697                     const struct lp_sampler_params *params)
3698 {
3699    boolean use_tex_func = FALSE;
3700 
3701    /*
3702     * Do not use a function call if the sampling is "simple enough".
3703     * We define this by
3704     * a) format
3705     * b) no mips (either one level only or no mip filter)
3706     * No mips will definitely make the code smaller, though
3707     * the format requirement is a bit iffy - there's some (SoA) formats
3708     * which definitely generate less code. This does happen to catch
3709     * some important cases though which are hurt quite a bit by using
3710     * a call (though not really because of the call overhead but because
3711     * they are reusing the same texture unit with some of the same
3712     * parameters).
3713     * Ideally we'd let llvm recognize this stuff by doing IPO passes.
3714     */
3715 
3716    if (USE_TEX_FUNC_CALL) {
3717       const struct util_format_description *format_desc;
3718       boolean simple_format;
3719       boolean simple_tex;
3720       enum lp_sampler_op_type op_type;
3721       format_desc = util_format_description(static_texture_state->format);
3722       simple_format = !format_desc ||
3723                          (util_format_is_rgba8_variant(format_desc) &&
3724                           format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3725 
3726       op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3727                     LP_SAMPLER_OP_TYPE_SHIFT;
3728       simple_tex =
3729          op_type != LP_SAMPLER_OP_TEXTURE ||
3730            ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3731              static_texture_state->level_zero_only == TRUE) &&
3732             static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3733 
3734       use_tex_func = format_desc && !(simple_format && simple_tex);
3735    }
3736 
3737    if (use_tex_func) {
3738       lp_build_sample_soa_func(gallivm,
3739                                static_texture_state,
3740                                static_sampler_state,
3741                                dynamic_state,
3742                                params);
3743    }
3744    else {
3745       lp_build_sample_soa_code(gallivm,
3746                                static_texture_state,
3747                                static_sampler_state,
3748                                dynamic_state,
3749                                params->type,
3750                                params->sample_key,
3751                                params->texture_index,
3752                                params->sampler_index,
3753                                params->context_ptr,
3754                                params->thread_data_ptr,
3755                                params->coords,
3756                                params->offsets,
3757                                params->derivs,
3758                                params->lod,
3759                                params->texel);
3760    }
3761 }
3762 
3763 
3764 void
lp_build_size_query_soa(struct gallivm_state * gallivm,const struct lp_static_texture_state * static_state,struct lp_sampler_dynamic_state * dynamic_state,const struct lp_sampler_size_query_params * params)3765 lp_build_size_query_soa(struct gallivm_state *gallivm,
3766                         const struct lp_static_texture_state *static_state,
3767                         struct lp_sampler_dynamic_state *dynamic_state,
3768                         const struct lp_sampler_size_query_params *params)
3769 {
3770    LLVMValueRef lod, level = 0, size;
3771    LLVMValueRef first_level = NULL;
3772    int dims, i;
3773    boolean has_array;
3774    unsigned num_lods = 1;
3775    struct lp_build_context bld_int_vec4;
3776    LLVMValueRef context_ptr = params->context_ptr;
3777    unsigned texture_unit = params->texture_unit;
3778    unsigned target = params->target;
3779 
3780    if (static_state->format == PIPE_FORMAT_NONE) {
3781       /*
3782        * If there's nothing bound, format is NONE, and we must return
3783        * all zero as mandated by d3d10 in this case.
3784        */
3785       unsigned chan;
3786       LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
3787       for (chan = 0; chan < 4; chan++) {
3788          params->sizes_out[chan] = zero;
3789       }
3790       return;
3791    }
3792 
3793    /*
3794     * Do some sanity verification about bound texture and shader dcl target.
3795     * Not entirely sure what's possible but assume array/non-array
3796     * always compatible (probably not ok for OpenGL but d3d10 has no
3797     * distinction of arrays at the resource level).
3798     * Everything else looks bogus (though not entirely sure about rect/2d).
3799     * Currently disabled because it causes assertion failures if there's
3800     * nothing bound (or rather a dummy texture, not that this case would
3801     * return the right values).
3802     */
3803    if (0 && static_state->target != target) {
3804       if (static_state->target == PIPE_TEXTURE_1D)
3805          assert(target == PIPE_TEXTURE_1D_ARRAY);
3806       else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
3807          assert(target == PIPE_TEXTURE_1D);
3808       else if (static_state->target == PIPE_TEXTURE_2D)
3809          assert(target == PIPE_TEXTURE_2D_ARRAY);
3810       else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
3811          assert(target == PIPE_TEXTURE_2D);
3812       else if (static_state->target == PIPE_TEXTURE_CUBE)
3813          assert(target == PIPE_TEXTURE_CUBE_ARRAY);
3814       else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
3815          assert(target == PIPE_TEXTURE_CUBE);
3816       else
3817          assert(0);
3818    }
3819 
3820    dims = texture_dims(target);
3821 
3822    switch (target) {
3823    case PIPE_TEXTURE_1D_ARRAY:
3824    case PIPE_TEXTURE_2D_ARRAY:
3825    case PIPE_TEXTURE_CUBE_ARRAY:
3826       has_array = TRUE;
3827       break;
3828    default:
3829       has_array = FALSE;
3830       break;
3831    }
3832 
3833    assert(!params->int_type.floating);
3834 
3835    lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
3836 
3837    if (params->explicit_lod) {
3838       /* FIXME: this needs to honor per-element lod */
3839       lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
3840                                     lp_build_const_int32(gallivm, 0), "");
3841       first_level = dynamic_state->first_level(dynamic_state, gallivm,
3842                                                context_ptr, texture_unit);
3843       level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
3844       lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
3845    } else {
3846       lod = bld_int_vec4.zero;
3847    }
3848 
3849    size = bld_int_vec4.undef;
3850 
3851    size = LLVMBuildInsertElement(gallivm->builder, size,
3852                                  dynamic_state->width(dynamic_state, gallivm,
3853                                                       context_ptr, texture_unit),
3854                                  lp_build_const_int32(gallivm, 0), "");
3855 
3856    if (dims >= 2) {
3857       size = LLVMBuildInsertElement(gallivm->builder, size,
3858                                     dynamic_state->height(dynamic_state, gallivm,
3859                                                           context_ptr, texture_unit),
3860                                     lp_build_const_int32(gallivm, 1), "");
3861    }
3862 
3863    if (dims >= 3) {
3864       size = LLVMBuildInsertElement(gallivm->builder, size,
3865                                     dynamic_state->depth(dynamic_state, gallivm,
3866                                                          context_ptr, texture_unit),
3867                                     lp_build_const_int32(gallivm, 2), "");
3868    }
3869 
3870    size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
3871 
3872    if (has_array) {
3873       LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
3874                                                  context_ptr, texture_unit);
3875       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3876          /*
3877           * It looks like GL wants number of cubes, d3d10.1 has it undefined?
3878           * Could avoid this by passing in number of cubes instead of total
3879           * number of layers (might make things easier elsewhere too).
3880           */
3881          LLVMValueRef six = lp_build_const_int32(gallivm, 6);
3882          layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
3883       }
3884       size = LLVMBuildInsertElement(gallivm->builder, size, layers,
3885                                     lp_build_const_int32(gallivm, dims), "");
3886    }
3887 
3888    /*
3889     * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
3890     * if level is out of bounds (note this can't cover unbound texture
3891     * here, which also requires returning zero).
3892     */
3893    if (params->explicit_lod && params->is_sviewinfo) {
3894       LLVMValueRef last_level, out, out1;
3895       struct lp_build_context leveli_bld;
3896 
3897       /* everything is scalar for now */
3898       lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
3899       last_level = dynamic_state->last_level(dynamic_state, gallivm,
3900                                              context_ptr, texture_unit);
3901 
3902       out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
3903       out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
3904       out = lp_build_or(&leveli_bld, out, out1);
3905       if (num_lods == 1) {
3906          out = lp_build_broadcast_scalar(&bld_int_vec4, out);
3907       }
3908       else {
3909          /* TODO */
3910          assert(0);
3911       }
3912       size = lp_build_andnot(&bld_int_vec4, size, out);
3913    }
3914    for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
3915       params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
3916                                                 size,
3917                                                 lp_build_const_int32(gallivm, i));
3918    }
3919    if (params->is_sviewinfo) {
3920       for (; i < 4; i++) {
3921          params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
3922       }
3923    }
3924 
3925    /*
3926     * if there's no explicit_lod (buffers, rects) queries requiring nr of
3927     * mips would be illegal.
3928     */
3929    if (params->is_sviewinfo && params->explicit_lod) {
3930       struct lp_build_context bld_int_scalar;
3931       LLVMValueRef num_levels;
3932       lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
3933 
3934       if (static_state->level_zero_only) {
3935          num_levels = bld_int_scalar.one;
3936       }
3937       else {
3938          LLVMValueRef last_level;
3939 
3940          last_level = dynamic_state->last_level(dynamic_state, gallivm,
3941                                                 context_ptr, texture_unit);
3942          num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
3943          num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
3944       }
3945       params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
3946                                         num_levels);
3947    }
3948 }
3949