1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5  * All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sub license, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the
16  * next paragraph) shall be included in all copies or substantial portions
17  * of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26  *
27  **************************************************************************/
28 
29 /**
30  * @file
31  * Position and shader input interpolation.
32  *
33  * @author Jose Fonseca <jfonseca@vmware.com>
34  */
35 
36 #include "pipe/p_shader_tokens.h"
37 #include "util/u_debug.h"
38 #include "util/u_memory.h"
39 #include "util/u_math.h"
40 #include "tgsi/tgsi_scan.h"
41 #include "gallivm/lp_bld_debug.h"
42 #include "gallivm/lp_bld_const.h"
43 #include "gallivm/lp_bld_arit.h"
44 #include "gallivm/lp_bld_swizzle.h"
45 #include "gallivm/lp_bld_flow.h"
46 #include "lp_bld_interp.h"
47 
48 
49 /*
50  * The shader JIT function operates on blocks of quads.
51  * Each block has 2x2 quads and each quad has 2x2 pixels.
52  *
53  * We iterate over the quads in order 0, 1, 2, 3:
54  *
55  * #################
56  * #   |   #   |   #
57  * #---0---#---1---#
58  * #   |   #   |   #
59  * #################
60  * #   |   #   |   #
61  * #---2---#---3---#
62  * #   |   #   |   #
63  * #################
64  *
65  * If we iterate over multiple quads at once, quads 01 and 23 are processed
66  * together.
67  *
68  * Within each quad, we have four pixels which are represented in SOA
69  * order:
70  *
71  * #########
72  * # 0 | 1 #
73  * #---+---#
74  * # 2 | 3 #
75  * #########
76  *
77  * So the green channel (for example) of the four pixels is stored in
78  * a single vector register: {g0, g1, g2, g3}.
79  * The order stays the same even with multiple quads:
80  * 0 1 4 5
81  * 2 3 6 7
82  * is stored as g0..g7
83  */
84 
85 
86 /**
87  * Do one perspective divide per quad.
88  *
89  * For perspective interpolation, the final attribute value is given
90  *
91  *  a' = a/w = a * oow
92  *
93  * where
94  *
95  *  a = a0 + dadx*x + dady*y
96  *  w = w0 + dwdx*x + dwdy*y
97  *  oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
98  *
99  * Instead of computing the division per pixel, with this macro we compute the
100  * division on the upper left pixel of each quad, and use a linear
101  * approximation in the remaining pixels, given by:
102  *
103  *  da'dx = (dadx - dwdx*a)*oow
104  *  da'dy = (dady - dwdy*a)*oow
105  *
106  * Ironically, this actually makes things slower -- probably because the
107  * divide hardware unit is rarely used, whereas the multiply unit is typically
108  * already saturated.
109  */
#define PERSPECTIVE_DIVIDE_PER_QUAD 0


/*
 * Pixel offsets inside a 4x4 pixel block, indexed by pixel number.
 * Each row below holds the four pixels of one 2x2 quad: quad 0 covers
 * x,y in [0,1], quad 1 sits to its right, quads 2 and 3 form the row
 * underneath.
 */
static const unsigned char quad_offset_x[16] = {
   0, 1, 0, 1,   /* quad 0 */
   2, 3, 2, 3,   /* quad 1 */
   0, 1, 0, 1,   /* quad 2 */
   2, 3, 2, 3    /* quad 3 */
};
static const unsigned char quad_offset_y[16] = {
   0, 0, 1, 1,   /* quad 0 */
   0, 0, 1, 1,   /* quad 1 */
   2, 2, 3, 3,   /* quad 2 */
   2, 2, 3, 3    /* quad 3 */
};
115 
116 
117 static void
attrib_name(LLVMValueRef val,unsigned attrib,unsigned chan,const char * suffix)118 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
119 {
120    if(attrib == 0)
121       lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
122    else
123       lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
124 }
125 
126 static void
calc_offsets(struct lp_build_context * coeff_bld,unsigned quad_start_index,LLVMValueRef * pixoffx,LLVMValueRef * pixoffy)127 calc_offsets(struct lp_build_context *coeff_bld,
128              unsigned quad_start_index,
129              LLVMValueRef *pixoffx,
130              LLVMValueRef *pixoffy)
131 {
132    unsigned i;
133    unsigned num_pix = coeff_bld->type.length;
134    struct gallivm_state *gallivm = coeff_bld->gallivm;
135    LLVMBuilderRef builder = coeff_bld->gallivm->builder;
136    LLVMValueRef nr, pixxf, pixyf;
137 
138    *pixoffx = coeff_bld->undef;
139    *pixoffy = coeff_bld->undef;
140 
141    for (i = 0; i < num_pix; i++) {
142       nr = lp_build_const_int32(gallivm, i);
143       pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
144                                    (quad_start_index & 1) * 2);
145       pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
146                                    (quad_start_index & 2));
147       *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
148       *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
149    }
150 }
151 
152 
153 /* Much easier, and significantly less instructions in the per-stamp
154  * part (less than half) but overall more instructions so a loss if
155  * most quads are active. Might be a win though with larger vectors.
156  * No ability to do per-quad divide (doable but not implemented)
157  * Could be made to work with passed in pixel offsets (i.e. active quad merging).
158  */
159 static void
coeffs_init_simple(struct lp_build_interp_soa_context * bld,LLVMValueRef a0_ptr,LLVMValueRef dadx_ptr,LLVMValueRef dady_ptr)160 coeffs_init_simple(struct lp_build_interp_soa_context *bld,
161                    LLVMValueRef a0_ptr,
162                    LLVMValueRef dadx_ptr,
163                    LLVMValueRef dady_ptr)
164 {
165    struct lp_build_context *coeff_bld = &bld->coeff_bld;
166    struct lp_build_context *setup_bld = &bld->setup_bld;
167    struct gallivm_state *gallivm = coeff_bld->gallivm;
168    LLVMBuilderRef builder = gallivm->builder;
169    unsigned attrib;
170 
171    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
172       /*
173        * always fetch all 4 values for performance/simplicity
174        * Note: we do that here because it seems to generate better
175        * code. It generates a lot of moves initially but less
176        * moves later. As far as I can tell this looks like a
177        * llvm issue, instead of simply reloading the values from
178        * the passed in pointers it if it runs out of registers
179        * it spills/reloads them. Maybe some optimization passes
180        * would help.
181        * Might want to investigate this again later.
182        */
183       const unsigned interp = bld->interp[attrib];
184       LLVMValueRef index = lp_build_const_int32(gallivm,
185                                 attrib * TGSI_NUM_CHANNELS);
186       LLVMValueRef ptr;
187       LLVMValueRef dadxaos = setup_bld->zero;
188       LLVMValueRef dadyaos = setup_bld->zero;
189       LLVMValueRef a0aos = setup_bld->zero;
190 
191       switch (interp) {
192       case LP_INTERP_PERSPECTIVE:
193          /* fall-through */
194 
195       case LP_INTERP_LINEAR:
196          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
197          ptr = LLVMBuildBitCast(builder, ptr,
198                LLVMPointerType(setup_bld->vec_type, 0), "");
199          dadxaos = LLVMBuildLoad(builder, ptr, "");
200 
201          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
202          ptr = LLVMBuildBitCast(builder, ptr,
203                LLVMPointerType(setup_bld->vec_type, 0), "");
204          dadyaos = LLVMBuildLoad(builder, ptr, "");
205 
206          attrib_name(dadxaos, attrib, 0, ".dadxaos");
207          attrib_name(dadyaos, attrib, 0, ".dadyaos");
208          /* fall-through */
209 
210       case LP_INTERP_CONSTANT:
211       case LP_INTERP_FACING:
212          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
213          ptr = LLVMBuildBitCast(builder, ptr,
214                LLVMPointerType(setup_bld->vec_type, 0), "");
215          a0aos = LLVMBuildLoad(builder, ptr, "");
216          attrib_name(a0aos, attrib, 0, ".a0aos");
217          break;
218 
219       case LP_INTERP_POSITION:
220          /* Nothing to do as the position coeffs are already setup in slot 0 */
221          continue;
222 
223       default:
224          assert(0);
225          break;
226       }
227       bld->a0aos[attrib] = a0aos;
228       bld->dadxaos[attrib] = dadxaos;
229       bld->dadyaos[attrib] = dadyaos;
230    }
231 }
232 
233 /**
234  * Interpolate the shader input attribute values.
235  * This is called for each (group of) quad(s).
236  */
237 static void
attribs_update_simple(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,int quad_start_index,LLVMValueRef loop_iter,int start,int end)238 attribs_update_simple(struct lp_build_interp_soa_context *bld,
239                       struct gallivm_state *gallivm,
240                       int quad_start_index,
241                       LLVMValueRef loop_iter,
242                       int start,
243                       int end)
244 {
245    LLVMBuilderRef builder = gallivm->builder;
246    struct lp_build_context *coeff_bld = &bld->coeff_bld;
247    struct lp_build_context *setup_bld = &bld->setup_bld;
248    LLVMValueRef oow = NULL;
249    unsigned attrib;
250    LLVMValueRef pixoffx;
251    LLVMValueRef pixoffy;
252 
253    /* could do this with code-generated passed in pixel offsets too */
254    if (bld->dynamic_offsets) {
255       LLVMValueRef ptr;
256 
257       assert(loop_iter);
258       ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
259       pixoffx = LLVMBuildLoad(builder, ptr, "");
260       ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
261       pixoffy = LLVMBuildLoad(builder, ptr, "");
262    }
263    else {
264       calc_offsets(coeff_bld, quad_start_index, &pixoffx, &pixoffy);
265    }
266 
267    pixoffx = LLVMBuildFAdd(builder, pixoffx,
268                            lp_build_broadcast_scalar(coeff_bld, bld->x), "");
269    pixoffy = LLVMBuildFAdd(builder, pixoffy,
270                            lp_build_broadcast_scalar(coeff_bld, bld->y), "");
271 
272    for (attrib = start; attrib < end; attrib++) {
273       const unsigned mask = bld->mask[attrib];
274       const unsigned interp = bld->interp[attrib];
275       unsigned chan;
276 
277       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
278          if (mask & (1 << chan)) {
279             LLVMValueRef index;
280             LLVMValueRef dadx = coeff_bld->zero;
281             LLVMValueRef dady = coeff_bld->zero;
282             LLVMValueRef a = coeff_bld->zero;
283 
284             index = lp_build_const_int32(gallivm, chan);
285             switch (interp) {
286             case LP_INTERP_PERSPECTIVE:
287                /* fall-through */
288 
289             case LP_INTERP_LINEAR:
290                if (attrib == 0 && chan == 0) {
291                   dadx = coeff_bld->one;
292                }
293                else if (attrib == 0 && chan == 1) {
294                   dady = coeff_bld->one;
295                }
296                else {
297                   dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
298                                                     coeff_bld->type, bld->dadxaos[attrib],
299                                                     index);
300                   dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
301                                                     coeff_bld->type, bld->dadyaos[attrib],
302                                                     index);
303                   a = lp_build_extract_broadcast(gallivm, setup_bld->type,
304                                                  coeff_bld->type, bld->a0aos[attrib],
305                                                  index);
306                }
307                /*
308                 * a = a0 + (x * dadx + y * dady)
309                 */
310                dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
311                dady = LLVMBuildFMul(builder, dady, pixoffy, "");
312                a = LLVMBuildFAdd(builder, a, dadx, "");
313                a = LLVMBuildFAdd(builder, a, dady, "");
314 
315                if (interp == LP_INTERP_PERSPECTIVE) {
316                   if (oow == NULL) {
317                      LLVMValueRef w = bld->attribs[0][3];
318                      assert(attrib != 0);
319                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
320                      oow = lp_build_rcp(coeff_bld, w);
321                   }
322                   a = lp_build_mul(coeff_bld, a, oow);
323                }
324                break;
325 
326             case LP_INTERP_CONSTANT:
327             case LP_INTERP_FACING:
328                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
329                                               coeff_bld->type, bld->a0aos[attrib],
330                                               index);
331                break;
332 
333             case LP_INTERP_POSITION:
334                assert(attrib > 0);
335                a = bld->attribs[0][chan];
336                break;
337 
338             default:
339                assert(0);
340                break;
341             }
342 
343             if ((attrib == 0) && (chan == 2)){
344                /* FIXME: Depth values can exceed 1.0, due to the fact that
345                 * setup interpolation coefficients refer to (0,0) which causes
346                 * precision loss. So we must clamp to 1.0 here to avoid artifacts
347                 */
348                a = lp_build_min(coeff_bld, a, coeff_bld->one);
349             }
350             bld->attribs[attrib][chan] = a;
351          }
352       }
353    }
354 }
355 
356 /**
357  * Initialize the bld->a, dadq fields.  This involves fetching
358  * those values from the arrays which are passed into the JIT function.
359  */
360 static void
coeffs_init(struct lp_build_interp_soa_context * bld,LLVMValueRef a0_ptr,LLVMValueRef dadx_ptr,LLVMValueRef dady_ptr)361 coeffs_init(struct lp_build_interp_soa_context *bld,
362             LLVMValueRef a0_ptr,
363             LLVMValueRef dadx_ptr,
364             LLVMValueRef dady_ptr)
365 {
366    struct lp_build_context *coeff_bld = &bld->coeff_bld;
367    struct lp_build_context *setup_bld = &bld->setup_bld;
368    struct gallivm_state *gallivm = coeff_bld->gallivm;
369    LLVMBuilderRef builder = gallivm->builder;
370    LLVMValueRef pixoffx, pixoffy;
371    unsigned attrib;
372    unsigned chan;
373    unsigned i;
374 
375    pixoffx = coeff_bld->undef;
376    pixoffy = coeff_bld->undef;
377    for (i = 0; i < coeff_bld->type.length; i++) {
378       LLVMValueRef nr = lp_build_const_int32(gallivm, i);
379       LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
380       LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
381       pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
382       pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
383    }
384 
385 
386    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
387       const unsigned mask = bld->mask[attrib];
388       const unsigned interp = bld->interp[attrib];
389       LLVMValueRef index = lp_build_const_int32(gallivm,
390                                 attrib * TGSI_NUM_CHANNELS);
391       LLVMValueRef ptr;
392       LLVMValueRef dadxaos = setup_bld->zero;
393       LLVMValueRef dadyaos = setup_bld->zero;
394       LLVMValueRef a0aos = setup_bld->zero;
395 
396       /* always fetch all 4 values for performance/simplicity */
397       switch (interp) {
398       case LP_INTERP_PERSPECTIVE:
399          /* fall-through */
400 
401       case LP_INTERP_LINEAR:
402          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
403          ptr = LLVMBuildBitCast(builder, ptr,
404                LLVMPointerType(setup_bld->vec_type, 0), "");
405          dadxaos = LLVMBuildLoad(builder, ptr, "");
406 
407          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
408          ptr = LLVMBuildBitCast(builder, ptr,
409                LLVMPointerType(setup_bld->vec_type, 0), "");
410          dadyaos = LLVMBuildLoad(builder, ptr, "");
411 
412          attrib_name(dadxaos, attrib, 0, ".dadxaos");
413          attrib_name(dadyaos, attrib, 0, ".dadyaos");
414          /* fall-through */
415 
416       case LP_INTERP_CONSTANT:
417       case LP_INTERP_FACING:
418          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
419          ptr = LLVMBuildBitCast(builder, ptr,
420                LLVMPointerType(setup_bld->vec_type, 0), "");
421          a0aos = LLVMBuildLoad(builder, ptr, "");
422          attrib_name(a0aos, attrib, 0, ".a0aos");
423          break;
424 
425       case LP_INTERP_POSITION:
426          /* Nothing to do as the position coeffs are already setup in slot 0 */
427          continue;
428 
429       default:
430          assert(0);
431          break;
432       }
433 
434       /*
435        * a = a0 + (x * dadx + y * dady)
436        * a0aos is the attrib value at top left corner of stamp
437        */
438       if (interp != LP_INTERP_CONSTANT &&
439           interp != LP_INTERP_FACING) {
440          LLVMValueRef axaos, ayaos;
441          axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
442                                dadxaos, "");
443          ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
444                                dadyaos, "");
445          a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
446          a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
447       }
448 
449       /*
450        * dadq = {0, dadx, dady, dadx + dady}
451        * for two quads (side by side) this is:
452        * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady}
453        */
454       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
455          /* this generates a CRAPLOAD of shuffles... */
456          if (mask & (1 << chan)) {
457             LLVMValueRef dadx, dady;
458             LLVMValueRef dadq, dadq2;
459             LLVMValueRef a;
460             LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
461 
462             if (attrib == 0 && chan == 0) {
463                a = lp_build_broadcast_scalar(coeff_bld, bld->x);
464                dadx = coeff_bld->one;
465                dady = coeff_bld->zero;
466             }
467             else if (attrib == 0 && chan == 1) {
468                a = lp_build_broadcast_scalar(coeff_bld, bld->y);
469                dady = coeff_bld->one;
470                dadx = coeff_bld->zero;
471             }
472             else {
473                dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
474                                               coeff_bld->type, dadxaos, chan_index);
475                dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
476                                               coeff_bld->type, dadyaos, chan_index);
477 
478                /*
479                 * a = {a, a, a, a}
480                 */
481                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
482                                               coeff_bld->type, a0aos, chan_index);
483             }
484 
485             dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
486             dady = LLVMBuildFMul(builder, dady, pixoffy, "");
487             dadq = LLVMBuildFAdd(builder, dadx, dady, "");
488 
489             /*
490              * Compute the attrib values on the upper-left corner of each
491              * group of quads.
492              * Note that if we process 2 quads at once this doesn't
493              * really exactly to what we want.
494              * We need to access elem 0 and 2 respectively later if we process
495              * 2 quads at once.
496              */
497 
498             if (interp != LP_INTERP_CONSTANT &&
499                 interp != LP_INTERP_FACING) {
500                dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
501                a = LLVMBuildFAdd(builder, a, dadq2, "");
502 	    }
503 
504 #if PERSPECTIVE_DIVIDE_PER_QUAD
505             /*
506              * a *= 1 / w
507              */
508 
509             /*
510              * XXX since we're only going to access elements 0,2 out of 8
511              * if we have 8-wide vectors we should do the division only 4-wide.
512              * a is really a 2-elements in a 4-wide vector disguised as 8-wide
513              * in this case.
514              */
515             if (interp == LP_INTERP_PERSPECTIVE) {
516                LLVMValueRef w = bld->a[0][3];
517                assert(attrib != 0);
518                assert(bld->mask[0] & TGSI_WRITEMASK_W);
519                if (!bld->oow) {
520                   bld->oow = lp_build_rcp(coeff_bld, w);
521                   lp_build_name(bld->oow, "oow");
522                }
523                a = lp_build_mul(coeff_bld, a, bld->oow);
524             }
525 #endif
526 
527             attrib_name(a, attrib, chan, ".a");
528             attrib_name(dadq, attrib, chan, ".dadq");
529 
530             if (bld->dynamic_offsets) {
531                bld->a[attrib][chan] = lp_build_alloca(gallivm,
532                                                       LLVMTypeOf(a), "");
533                LLVMBuildStore(builder, a, bld->a[attrib][chan]);
534             }
535             else {
536                bld->a[attrib][chan] = a;
537             }
538             bld->dadq[attrib][chan] = dadq;
539          }
540       }
541    }
542 }
543 
544 
545 /**
546  * Increment the shader input attribute values.
547  * This is called when we move from one quad to the next.
548  */
549 static void
attribs_update(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,int quad_start_index,LLVMValueRef loop_iter,int start,int end)550 attribs_update(struct lp_build_interp_soa_context *bld,
551                struct gallivm_state *gallivm,
552                int quad_start_index,
553                LLVMValueRef loop_iter,
554                int start,
555                int end)
556 {
557    LLVMBuilderRef builder = gallivm->builder;
558    struct lp_build_context *coeff_bld = &bld->coeff_bld;
559    LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_start_index);
560    LLVMValueRef oow = NULL;
561    unsigned attrib;
562    unsigned chan;
563 
564    assert(quad_start_index < 4);
565 
566    for(attrib = start; attrib < end; ++attrib) {
567       const unsigned mask = bld->mask[attrib];
568       const unsigned interp = bld->interp[attrib];
569       for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
570          if(mask & (1 << chan)) {
571             LLVMValueRef a;
572             if (interp == LP_INTERP_CONSTANT ||
573                 interp == LP_INTERP_FACING) {
574                a = bld->a[attrib][chan];
575                if (bld->dynamic_offsets) {
576                   a = LLVMBuildLoad(builder, a, "");
577                }
578             }
579             else if (interp == LP_INTERP_POSITION) {
580                assert(attrib > 0);
581                a = bld->attribs[0][chan];
582             }
583             else {
584                LLVMValueRef dadq;
585 
586                a = bld->a[attrib][chan];
587 
588                /*
589                 * Broadcast the attribute value for this quad into all elements
590                 */
591 
592                if (bld->dynamic_offsets) {
593                   /* stored as vector load as float */
594                   LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
595                                                             gallivm->context), 0);
596                   LLVMValueRef ptr;
597                   a = LLVMBuildBitCast(builder, a, ptr_type, "");
598                   ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
599                   a = LLVMBuildLoad(builder, ptr, "");
600                   a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
601                }
602                else {
603                   a = LLVMBuildShuffleVector(builder,
604                                              a, coeff_bld->undef, shuffle, "");
605                }
606 
607                /*
608                 * Get the derivatives.
609                 */
610 
611                dadq = bld->dadq[attrib][chan];
612 
613 #if PERSPECTIVE_DIVIDE_PER_QUAD
614                if (interp == LP_INTERP_PERSPECTIVE) {
615                   LLVMValueRef dwdq = bld->dadq[0][3];
616 
617                   if (oow == NULL) {
618                      assert(bld->oow);
619                      oow = LLVMBuildShuffleVector(coeff_bld->builder,
620                                                   bld->oow, coeff_bld->undef,
621                                                   shuffle, "");
622                   }
623 
624                   dadq = lp_build_sub(coeff_bld,
625                                       dadq,
626                                       lp_build_mul(coeff_bld, a, dwdq));
627                   dadq = lp_build_mul(coeff_bld, dadq, oow);
628                }
629 #endif
630 
631                /*
632                 * Add the derivatives
633                 */
634 
635                a = lp_build_add(coeff_bld, a, dadq);
636 
637 #if !PERSPECTIVE_DIVIDE_PER_QUAD
638                if (interp == LP_INTERP_PERSPECTIVE) {
639                   if (oow == NULL) {
640                      LLVMValueRef w = bld->attribs[0][3];
641                      assert(attrib != 0);
642                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
643                      oow = lp_build_rcp(coeff_bld, w);
644                   }
645                   a = lp_build_mul(coeff_bld, a, oow);
646                }
647 #endif
648 
649                if (attrib == 0 && chan == 2) {
650                   /* FIXME: Depth values can exceed 1.0, due to the fact that
651                    * setup interpolation coefficients refer to (0,0) which causes
652                    * precision loss. So we must clamp to 1.0 here to avoid artifacts
653                    */
654                   a = lp_build_min(coeff_bld, a, coeff_bld->one);
655                }
656 
657                attrib_name(a, attrib, chan, "");
658             }
659             bld->attribs[attrib][chan] = a;
660          }
661       }
662    }
663 }
664 
665 
666 /**
667  * Generate the position vectors.
668  *
669  * Parameter x0, y0 are the integer values with upper left coordinates.
670  */
671 static void
pos_init(struct lp_build_interp_soa_context * bld,LLVMValueRef x0,LLVMValueRef y0)672 pos_init(struct lp_build_interp_soa_context *bld,
673          LLVMValueRef x0,
674          LLVMValueRef y0)
675 {
676    LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
677    struct lp_build_context *coeff_bld = &bld->coeff_bld;
678 
679    bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
680    bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
681 }
682 
683 
684 /**
685  * Initialize fragment shader input attribute info.
686  */
687 void
lp_build_interp_soa_init(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,unsigned num_inputs,const struct lp_shader_input * inputs,LLVMBuilderRef builder,struct lp_type type,boolean dynamic_offsets,LLVMValueRef a0_ptr,LLVMValueRef dadx_ptr,LLVMValueRef dady_ptr,LLVMValueRef x0,LLVMValueRef y0)688 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
689                          struct gallivm_state *gallivm,
690                          unsigned num_inputs,
691                          const struct lp_shader_input *inputs,
692                          LLVMBuilderRef builder,
693                          struct lp_type type,
694                          boolean dynamic_offsets,
695                          LLVMValueRef a0_ptr,
696                          LLVMValueRef dadx_ptr,
697                          LLVMValueRef dady_ptr,
698                          LLVMValueRef x0,
699                          LLVMValueRef y0)
700 {
701    struct lp_type coeff_type;
702    struct lp_type setup_type;
703    unsigned attrib;
704    unsigned chan;
705 
706    memset(bld, 0, sizeof *bld);
707 
708    memset(&coeff_type, 0, sizeof coeff_type);
709    coeff_type.floating = TRUE;
710    coeff_type.sign = TRUE;
711    coeff_type.width = 32;
712    coeff_type.length = type.length;
713 
714    memset(&setup_type, 0, sizeof setup_type);
715    setup_type.floating = TRUE;
716    setup_type.sign = TRUE;
717    setup_type.width = 32;
718    setup_type.length = TGSI_NUM_CHANNELS;
719 
720 
721    /* XXX: we don't support interpolating into any other types */
722    assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
723 
724    lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
725    lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
726 
727    /* For convenience */
728    bld->pos = bld->attribs[0];
729    bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
730 
731    /* Position */
732    bld->mask[0] = TGSI_WRITEMASK_XYZW;
733    bld->interp[0] = LP_INTERP_LINEAR;
734 
735    /* Inputs */
736    for (attrib = 0; attrib < num_inputs; ++attrib) {
737       bld->mask[1 + attrib] = inputs[attrib].usage_mask;
738       bld->interp[1 + attrib] = inputs[attrib].interp;
739    }
740    bld->num_attribs = 1 + num_inputs;
741 
742    /* Ensure all masked out input channels have a valid value */
743    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
744       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
745          bld->attribs[attrib][chan] = bld->coeff_bld.undef;
746       }
747    }
748 
749    pos_init(bld, x0, y0);
750 
751    if (coeff_type.length > 4) {
752       bld->simple_interp = TRUE;
753       if (dynamic_offsets) {
754          /* XXX this should use a global static table */
755          unsigned i;
756          unsigned num_loops = 16 / type.length;
757          LLVMValueRef pixoffx, pixoffy, index;
758          LLVMValueRef ptr;
759 
760          bld->dynamic_offsets = TRUE;
761          bld->xoffset_store = lp_build_array_alloca(gallivm,
762                                                     lp_build_vec_type(gallivm, type),
763                                                     lp_build_const_int32(gallivm, num_loops),
764                                                     "");
765          bld->yoffset_store = lp_build_array_alloca(gallivm,
766                                                     lp_build_vec_type(gallivm, type),
767                                                     lp_build_const_int32(gallivm, num_loops),
768                                                     "");
769          for (i = 0; i < num_loops; i++) {
770             index = lp_build_const_int32(gallivm, i);
771             calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
772             ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
773             LLVMBuildStore(builder, pixoffx, ptr);
774             ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
775             LLVMBuildStore(builder, pixoffy, ptr);
776          }
777       }
778       coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
779    }
780    else {
781       bld->simple_interp = FALSE;
782       if (dynamic_offsets) {
783          bld->dynamic_offsets = TRUE;
784       }
785       coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
786    }
787 
788 }
789 
790 
791 /**
792  * Advance the position and inputs to the given quad within the block.
793  */
794 void
lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,int quad_start_index)795 lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
796                                   struct gallivm_state *gallivm,
797                                   int quad_start_index)
798 {
799    assert(quad_start_index < 4);
800 
801    if (bld->simple_interp) {
802       attribs_update_simple(bld, gallivm, quad_start_index, NULL, 1, bld->num_attribs);
803    }
804    else {
805       attribs_update(bld, gallivm, quad_start_index, NULL, 1, bld->num_attribs);
806    }
807 }
808 
809 void
lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,int quad_start_index)810 lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
811                                struct gallivm_state *gallivm,
812                                int quad_start_index)
813 {
814    assert(quad_start_index < 4);
815 
816    if (bld->simple_interp) {
817       attribs_update_simple(bld, gallivm, quad_start_index, NULL, 0, 1);
818    }
819    else {
820       attribs_update(bld, gallivm, quad_start_index, NULL, 0, 1);
821    }
822 }
823 
824 void
lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,LLVMValueRef quad_start_index)825 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
826                                       struct gallivm_state *gallivm,
827                                       LLVMValueRef quad_start_index)
828 {
829    if (bld->simple_interp) {
830       attribs_update_simple(bld, gallivm, 0, quad_start_index, 1, bld->num_attribs);
831    }
832    else {
833       attribs_update(bld, gallivm, 0, quad_start_index, 1, bld->num_attribs);
834    }
835 }
836 
837 void
lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,LLVMValueRef quad_start_index)838 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
839                                    struct gallivm_state *gallivm,
840                                    LLVMValueRef quad_start_index)
841 {
842    if (bld->simple_interp) {
843       attribs_update_simple(bld, gallivm, 0, quad_start_index, 0, 1);
844    }
845    else {
846       attribs_update(bld, gallivm, 0, quad_start_index, 0, 1);
847    }
848 }
849 
850