/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all the necessary information readily
 *   available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special-case values of a or b (0 or 1) are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   /* Generic fallback: compare and select. */
   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special-case values of a or b (0 or 1) are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
         intr_size = 128;
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   /* Generic fallback: compare and select. */
   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}
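
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical): for unsigned normalized 8-bit values 1.0 is represented
 * as 0xff, so the complement 1.0 - a equals the bitwise NOT used above:
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void
check_unorm8_comp(void)
{
   unsigned a;
   for (a = 0; a <= 255; a++)
      assert((uint8_t)(255 - a) == (uint8_t)~a);
}
#endif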


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
        return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
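
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical): the paddus.b path above performs unsigned saturating
 * addition, clamping the sum to the maximum value instead of wrapping:
 */
#if 0
#include <stdint.h>

static uint8_t
addus8(uint8_t a, uint8_t b)
{
   unsigned sum = (unsigned)a + b;
   return sum > 255 ? 255 : (uint8_t)sum;   /* saturate, don't wrap */
}
#endif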


/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * Using repeated shuffle/adds here. Note that with multiple vectors
    * this can be done more efficiently as outlined in the Intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
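
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical) of the reduction above: repeatedly add the upper half of
 * the vector onto the lower half, so a length-n sum takes log2(n)
 * addition steps instead of n-1 serial adds:
 */
#if 0
static float
horizontal_add_ref(float v[], unsigned length)
{
   /* assumes length is a power of two */
   unsigned half, i;
   for (half = length / 2; half > 0; half /= 2)
      for (i = 0; i < half; i++)
         v[i] += v[i + half];
   return v[0];
}
#endif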

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}
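
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical): the shuffles above compute, for four float4 inputs, the
 * equivalent of the reference below, but entirely in vector registers:
 */
#if 0
static void
hadd4x4f_ref(const float src[4][4], float dst[4])
{
   unsigned i;
   for (i = 0; i < 4; i++)
      dst[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
}
#endif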


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
        return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8-bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria:
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria,
 *     as 255*255 = 254, so the special case b = 255 must be accounted
 *     for or roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the
 *     result use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     which satisfies the OpenGL criteria exactly and stays within one
 *     least significant bit of correct rounding
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(struct gallivm_state *gallivm,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef c8;
   LLVMValueRef ab;

   assert(!i16_type.floating);
   assert(lp_check_value(i16_type, a));
   assert(lp_check_value(i16_type, b));

   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}
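
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical): brute-force check of the formula used above. It hits
 * the OpenGL-mandated endpoints exactly and never differs from
 * round(a*b/255) by more than one least significant bit:
 */
#if 0
#include <assert.h>

static void
check_mul_u8n(void)
{
   unsigned a, b;
   for (a = 0; a <= 255; a++) {
      for (b = 0; b <= 255; b++) {
         unsigned ab = a * b;
         unsigned approx = (ab + (ab >> 8) + 0x80) >> 8;
         unsigned exact = (2*ab + 255) / 510;   /* round(ab/255) */
         assert(exact - approx <= 1);
      }
   }
   assert((((255*255) + ((255*255) >> 8) + 0x80) >> 8) == 255);
}
#endif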


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
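
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical): the post-multiply shift above implements fixed-point
 * multiplication. For a 16-bit "8.8" format the raw product has 16
 * fractional bits, so shifting right by width/2 = 8 restores the format.
 * The widening below is for clarity; the helper above multiplies in the
 * original width, so overflowing products wrap:
 */
#if 0
#include <stdint.h>

static int16_t
mul_fixed_8_8(int16_t a, int16_t b)
{
   int32_t product = (int32_t)a * b;   /* 16.16 intermediate */
   return (int16_t)(product >> 8);     /* back to 8.8 */
}
#endif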


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
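
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical) of the disabled exponent trick above: adding n to the
 * biased exponent field of an IEEE-754 float multiplies it by 2^n, valid
 * only for normal numbers (zero, Inf and NaN come out wrong):
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
mul_pow2(float x, int n)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);   /* reinterpret as integer */
   bits += (uint32_t)n << 23;        /* bump the exponent field */
   memcpy(&x, &bits, sizeof x);
   return x;
}
#endif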


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation -- without any checks.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8-bit colors stored in
       * 16 bits, but it will be wrong for other uses. Basically we need a
       * more powerful lp_type, capable of further distinguishing the value's
       * interpretation from the value's storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256]
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}
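
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical): the shift-and-add above widens the 8-bit weight from
 * [0, 255] to [0, 256], so that x = 255 selects v1 exactly in the
 * fixed-point lerp (truncating division stands in for the shift here):
 */
#if 0
#include <stdint.h>

static uint8_t
lerp_unorm8(uint8_t x, uint8_t v0, uint8_t v1)
{
   int w = x + (x >> 7);               /* 0 -> 0, 255 -> 256 */
   int delta = (int)v1 - (int)v0;
   return (uint8_t)(v0 + (w * delta) / 256);
}
#endif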


LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
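
/*
 * Illustrative scalar sketch (not part of the build, helper name is
 * hypothetical) of the float path above: OR-ing the sign bit of 'a' into
 * the bit pattern of 1.0f yields +/-1.0f without a compare:
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
sgn_nonzero(float a)
{
   uint32_t bits, one;
   float res = 1.0f;
   memcpy(&bits, &a, sizeof bits);
   memcpy(&one, &res, sizeof one);
   one |= bits & 0x80000000u;        /* copy the sign bit onto 1.0f */
   memcpy(&res, &one, sizeof res);
   return res;                       /* zero is handled by the caller */
}
#endif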


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
sse41_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;

   return FALSE;
}

enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning a fractional part of 1.0 for very small negative values of
 * 'a' by clamping against 0.99999(9).
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min(bld, fract, max);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}
1602 
1603 
1604 /**
1605  * Return float (vector) rounded to nearest integer (vector).  The returned
1606  * value is an integer (vector).
1607  * Ex: iround(0.9) = 1
1608  * Ex: iround(-1.5) = -2
1609  */
1610 LLVMValueRef
lp_build_iround(struct lp_build_context * bld,LLVMValueRef a)1611 lp_build_iround(struct lp_build_context *bld,
1612                 LLVMValueRef a)
1613 {
1614    LLVMBuilderRef builder = bld->gallivm->builder;
1615    const struct lp_type type = bld->type;
1616    LLVMTypeRef int_vec_type = bld->int_vec_type;
1617    LLVMValueRef res;
1618 
1619    assert(type.floating);
1620 
1621    assert(lp_check_value(type, a));
1622 
1623    if ((util_cpu_caps.has_sse2 &&
1624        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1625        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1626       return lp_build_iround_nearest_sse2(bld, a);
1627    }
1628    if (sse41_rounding_available(type)) {
1629       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1630    }
1631    else {
1632       LLVMValueRef half;
1633 
1634       half = lp_build_const_vec(bld->gallivm, type, 0.5);
1635 
1636       if (type.sign) {
1637          LLVMTypeRef vec_type = bld->vec_type;
1638          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1639                                     (unsigned long long)1 << (type.width - 1));
1640          LLVMValueRef sign;
1641 
1642          /* get sign bit */
1643          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1644          sign = LLVMBuildAnd(builder, sign, mask, "");
1645 
1646          /* sign * 0.5 */
1647          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1648          half = LLVMBuildOr(builder, sign, half, "");
1649          half = LLVMBuildBitCast(builder, half, vec_type, "");
1650       }
1651 
1652       res = LLVMBuildFAdd(builder, a, half, "");
1653    }
1654 
1655    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1656 
1657    return res;
1658 }
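
/*
 * Scalar sketch of the non-SSE fallback above (hypothetical, for
 * illustration only): round to nearest, half away from zero, by adding a
 * sign-adjusted bias and truncating.  OR-ing the sign bit into 0.5 is
 * equivalent to C99's copysignf:
 *
 *    float half = copysignf(0.5f, a);   <- 0.5 carrying a's sign bit
 *    int   res  = (int)(a + half);      <- FPToSI truncates
 *
 * E.g. a = -1.5: -1.5 + (-0.5) = -2.0, truncated to -2 = iround(-1.5).
 */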
1659 
1660 
1661 /**
1662  * Return floor of float (vector); the result is an int (vector).
1663  * Ex: ifloor( 1.1) = 1
1664  * Ex: ifloor(-1.1) = -2
1665  */
1666 LLVMValueRef
1667 lp_build_ifloor(struct lp_build_context *bld,
1668                 LLVMValueRef a)
1669 {
1670    LLVMBuilderRef builder = bld->gallivm->builder;
1671    const struct lp_type type = bld->type;
1672    LLVMTypeRef int_vec_type = bld->int_vec_type;
1673    LLVMValueRef res;
1674 
1675    assert(type.floating);
1676    assert(lp_check_value(type, a));
1677 
1678    res = a;
1679    if (type.sign) {
1680       if (sse41_rounding_available(type)) {
1681          res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1682       }
1683       else {
1684          /* Turn the sign bit into a -0.99999(9) offset for negative values */
1685          LLVMTypeRef vec_type = bld->vec_type;
1686          unsigned mantissa = lp_mantissa(type);
1687          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1688                                   (unsigned long long)1 << (type.width - 1));
1689          LLVMValueRef sign;
1690          LLVMValueRef offset;
1691 
1692          /* sign = a < 0 ? ~0 : 0 */
1693          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1694          sign = LLVMBuildAnd(builder, sign, mask, "");
1695          sign = LLVMBuildAShr(builder, sign,
1696                               lp_build_const_int_vec(bld->gallivm, type,
1697                                                      type.width - 1),
1698                               "ifloor.sign");
1699 
1700          /* offset = -0.99999(9)f */
1701          offset = lp_build_const_vec(bld->gallivm, type,
1702                                      -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1703          offset = LLVMConstBitCast(offset, int_vec_type);
1704 
1705          /* offset = a < 0 ? offset : 0.0f */
1706          offset = LLVMBuildAnd(builder, offset, sign, "");
1707          offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1708 
1709          res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1710       }
1711    }
1712 
1713    /* truncate (round toward zero) */
1714    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1715 
1716    return res;
1717 }
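
/*
 * Scalar sketch of the non-SSE4.1 path above (hypothetical, for
 * illustration only); iceil() below mirrors it with a positive offset for
 * non-negative values.  With the float32 constant built above,
 * (2^23 - 10) / 2^23 = 0.9999988:
 *
 *    float offset = (a < 0.0f) ? -0.9999988f : 0.0f;
 *    int   res    = (int)(a + offset);
 *
 * E.g. a = -1.1: -1.1 - 0.9999988 = -2.0999988, truncated to -2 = floor(-1.1),
 * and a = -2.0: -2.0 - 0.9999988 = -2.9999988, truncated to -2 = floor(-2.0).
 */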
1718 
1719 
1720 /**
1721  * Return ceiling of float (vector), returning int (vector).
1722  * Ex: iceil( 1.1) = 2
1723  * Ex: iceil(-1.1) = -1
1724  */
1725 LLVMValueRef
1726 lp_build_iceil(struct lp_build_context *bld,
1727                LLVMValueRef a)
1728 {
1729    LLVMBuilderRef builder = bld->gallivm->builder;
1730    const struct lp_type type = bld->type;
1731    LLVMTypeRef int_vec_type = bld->int_vec_type;
1732    LLVMValueRef res;
1733 
1734    assert(type.floating);
1735    assert(lp_check_value(type, a));
1736 
1737    if (sse41_rounding_available(type)) {
1738       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1739    }
1740    else {
1741       LLVMTypeRef vec_type = bld->vec_type;
1742       unsigned mantissa = lp_mantissa(type);
1743       LLVMValueRef offset;
1744 
1745       /* offset = 0.99999(9)f */
1746       offset = lp_build_const_vec(bld->gallivm, type,
1747                                   (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1748 
1749       if (type.sign) {
1750          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1751                                 (unsigned long long)1 << (type.width - 1));
1752          LLVMValueRef sign;
1753 
1754          /* sign = a < 0 ? 0 : ~0 */
1755          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1756          sign = LLVMBuildAnd(builder, sign, mask, "");
1757          sign = LLVMBuildAShr(builder, sign,
1758                               lp_build_const_int_vec(bld->gallivm, type,
1759                                                      type.width - 1),
1760                               "iceil.sign");
1761          sign = LLVMBuildNot(builder, sign, "iceil.not");
1762 
1763          /* offset = a < 0 ? 0.0 : offset */
1764          offset = LLVMConstBitCast(offset, int_vec_type);
1765          offset = LLVMBuildAnd(builder, offset, sign, "");
1766          offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1767       }
1768 
1769       res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1770    }
1771 
1772    /* truncate (round toward zero) */
1773    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1774 
1775    return res;
1776 }
1777 
1778 
1779 /**
1780  * Combined ifloor() & fract().
1781  *
1782  * Preferable to calling the functions separately, as it ensures that the
1783  * strategy (floor() vs. ifloor()) resulting in the least redundant work is used.
1784  */
1785 void
1786 lp_build_ifloor_fract(struct lp_build_context *bld,
1787                       LLVMValueRef a,
1788                       LLVMValueRef *out_ipart,
1789                       LLVMValueRef *out_fpart)
1790 {
1791    LLVMBuilderRef builder = bld->gallivm->builder;
1792    const struct lp_type type = bld->type;
1793    LLVMValueRef ipart;
1794 
1795    assert(type.floating);
1796    assert(lp_check_value(type, a));
1797 
1798    if (sse41_rounding_available(type)) {
1799       /*
1800        * floor() is easier.
1801        */
1802 
1803       ipart = lp_build_floor(bld, a);
1804       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1805       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1806    }
1807    else {
1808       /*
1809        * ifloor() is easier.
1810        */
1811 
1812       *out_ipart = lp_build_ifloor(bld, a);
1813       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1814       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1815    }
1816 }
1817 
1818 
1819 /**
1820  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
1821  * always smaller than one.
1822  */
1823 void
1824 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
1825                            LLVMValueRef a,
1826                            LLVMValueRef *out_ipart,
1827                            LLVMValueRef *out_fpart)
1828 {
1829    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
1830    *out_fpart = clamp_fract(bld, *out_fpart);
1831 }
1832 
1833 
1834 LLVMValueRef
1835 lp_build_sqrt(struct lp_build_context *bld,
1836               LLVMValueRef a)
1837 {
1838    LLVMBuilderRef builder = bld->gallivm->builder;
1839    const struct lp_type type = bld->type;
1840    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1841    char intrinsic[32];
1842 
1843    assert(lp_check_value(type, a));
1844 
1845    /* TODO: optimize the constant case */
1846 
1847    assert(type.floating);
1848    if (type.length == 1) {
1849       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
1850    }
1851    else {
1852       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1853    }
1854 
1855    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1856 }
1857 
1858 
1859 /**
1860  * Do one Newton-Raphson step to improve reciprocal precision:
1861  *
1862  *   x_{i+1} = x_i * (2 - a * x_i)
1863  *
1864  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1865  * +/-Inf, yielding NaN instead.  Certain applications rely on the conformant
1866  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
1867  * the Earth's halo.  It would be necessary to clamp the argument to prevent this.
1868  *
1869  * See also:
1870  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1871  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1872  */
1873 static INLINE LLVMValueRef
1874 lp_build_rcp_refine(struct lp_build_context *bld,
1875                     LLVMValueRef a,
1876                     LLVMValueRef rcp_a)
1877 {
1878    LLVMBuilderRef builder = bld->gallivm->builder;
1879    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1880    LLVMValueRef res;
1881 
1882    res = LLVMBuildFMul(builder, a, rcp_a, "");
1883    res = LLVMBuildFSub(builder, two, res, "");
1884    res = LLVMBuildFMul(builder, rcp_a, res, "");
1885 
1886    return res;
1887 }
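
/*
 * Numeric example of the refinement step (convergence is quadratic,
 * roughly doubling the number of correct digits per step): for a = 3 and
 * an initial estimate x0 = 0.3,
 *
 *    x1 = 0.3  * (2 - 3 * 0.3)  = 0.33
 *    x2 = 0.33 * (2 - 3 * 0.33) = 0.3333
 */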
1888 
1889 
1890 LLVMValueRef
1891 lp_build_rcp(struct lp_build_context *bld,
1892              LLVMValueRef a)
1893 {
1894    LLVMBuilderRef builder = bld->gallivm->builder;
1895    const struct lp_type type = bld->type;
1896 
1897    assert(lp_check_value(type, a));
1898 
1899    if(a == bld->zero)
1900       return bld->undef;
1901    if(a == bld->one)
1902       return bld->one;
1903    if(a == bld->undef)
1904       return bld->undef;
1905 
1906    assert(type.floating);
1907 
1908    if(LLVMIsConstant(a))
1909       return LLVMConstFDiv(bld->one, a);
1910 
1911    /*
1912     * We don't use RCPPS because:
1913     * - it only has 10 bits of precision
1914     * - it doesn't even get the reciprocal of 1.0 exactly
1915     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1916     * - for recent processors the benefit over DIVPS is marginal and case
1917     *   dependent
1918     *
1919     * We could still use it on certain processors if benchmarks show that the
1920     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
1921     * particular uses that require fewer workarounds.
1922     */
1923 
1924    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1925          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
1926       const unsigned num_iterations = 0;
1927       LLVMValueRef res;
1928       unsigned i;
1929       const char *intrinsic = NULL;
1930 
1931       if (type.length == 4) {
1932          intrinsic = "llvm.x86.sse.rcp.ps";
1933       }
1934       else {
1935          intrinsic = "llvm.x86.avx.rcp.ps.256";
1936       }
1937 
1938       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1939 
1940       for (i = 0; i < num_iterations; ++i) {
1941          res = lp_build_rcp_refine(bld, a, res);
1942       }
1943 
1944       return res;
1945    }
1946 
1947    return LLVMBuildFDiv(builder, bld->one, a, "");
1948 }
1949 
1950 
1951 /**
1952  * Do one Newton-Raphson step to improve rsqrt precision:
1953  *
1954  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1955  *
1956  * See also:
1957  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1958  */
1959 static INLINE LLVMValueRef
1960 lp_build_rsqrt_refine(struct lp_build_context *bld,
1961                       LLVMValueRef a,
1962                       LLVMValueRef rsqrt_a)
1963 {
1964    LLVMBuilderRef builder = bld->gallivm->builder;
1965    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1966    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1967    LLVMValueRef res;
1968 
1969    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1970    res = LLVMBuildFMul(builder, a, res, "");
1971    res = LLVMBuildFSub(builder, three, res, "");
1972    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1973    res = LLVMBuildFMul(builder, half, res, "");
1974 
1975    return res;
1976 }
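
/*
 * Numeric example of the refinement step: for a = 2 and a rough estimate
 * x0 = 0.7,
 *
 *    x1 = 0.5 * 0.7 * (3 - 2 * 0.7 * 0.7) = 0.35 * 2.02 = 0.707
 *
 * already close to 1/sqrt(2) = 0.70710678.
 */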
1977 
1978 
1979 /**
1980  * Generate 1/sqrt(a)
1981  */
1982 LLVMValueRef
1983 lp_build_rsqrt(struct lp_build_context *bld,
1984                LLVMValueRef a)
1985 {
1986    LLVMBuilderRef builder = bld->gallivm->builder;
1987    const struct lp_type type = bld->type;
1988 
1989    assert(lp_check_value(type, a));
1990 
1991    assert(type.floating);
1992 
1993    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1994         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1995       const unsigned num_iterations = 1;
1996       LLVMValueRef res;
1997       unsigned i;
1998       const char *intrinsic = NULL;
1999 
2000       if (type.length == 4) {
2001          intrinsic = "llvm.x86.sse.rsqrt.ps";
2002       }
2003       else {
2004          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2005       }
2006 
2007       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2008 
2009 
2010       for (i = 0; i < num_iterations; ++i) {
2011          res = lp_build_rsqrt_refine(bld, a, res);
2012       }
2013 
2014       return res;
2015    }
2016 
2017    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2018 }
2019 
2020 
2021 /**
2022  * Generate sin(a) using SSE2
2023  */
2024 LLVMValueRef
2025 lp_build_sin(struct lp_build_context *bld,
2026              LLVMValueRef a)
2027 {
2028    struct gallivm_state *gallivm = bld->gallivm;
2029    LLVMBuilderRef builder = gallivm->builder;
2030    struct lp_type int_type = lp_int_type(bld->type);
2031    LLVMBuilderRef b = builder;
2032 
2033    /*
2034     *  take the absolute value,
2035     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2036     */
2037 
2038    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2039    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2040 
2041    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2042    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2043 
2044    /*
2045     * extract the sign bit (upper one)
2046     * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2047     */
2048    LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2049    LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2050 
2051    /*
2052     * scale by 4/Pi
2053     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2054     */
2055 
2056    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2057    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2058 
2059    /*
2060     * store the integer part of y in mm0
2061     * emm2 = _mm_cvttps_epi32(y);
2062     */
2063 
2064    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2065 
2066    /*
2067     * j=(j+1) & (~1) (see the cephes sources)
2068     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2069     */
2070 
2071    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2072    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2073    /*
2074     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2075     */
2076    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2077    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2078 
2079    /*
2080     * y = _mm_cvtepi32_ps(emm2);
2081     */
2082    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2083 
2084    /* get the swap sign flag
2085     * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2086     */
2087    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2088    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2089 
2090    /*
2091     * emm2 = _mm_slli_epi32(emm0, 29);
2092     */
2093    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2094    LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2095 
2096    /*
2097     * get the polynomial selection mask:
2098     * there is one polynomial for 0 <= x <= Pi/4
2099     * and another one for Pi/4 < x <= Pi/2.
2100     * Both branches will be computed.
2101     *
2102     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2103     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2104     */
2105 
2106    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2107    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2108    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2109                                              int_type, PIPE_FUNC_EQUAL,
2110                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2111    /*
2112     *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2113     */
2114    LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2115 
2116    /*
2117     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2118     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2119     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2120     */
2121    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2122    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2123    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2124 
2125    /*
2126     * The magic pass: "Extended precision modular arithmetic"
2127     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2128     * xmm1 = _mm_mul_ps(y, xmm1);
2129     * xmm2 = _mm_mul_ps(y, xmm2);
2130     * xmm3 = _mm_mul_ps(y, xmm3);
2131     */
2132    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2133    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2134    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2135 
2136    /*
2137     * x = _mm_add_ps(x, xmm1);
2138     * x = _mm_add_ps(x, xmm2);
2139     * x = _mm_add_ps(x, xmm3);
2140     */
2141 
2142    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2143    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2144    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2145 
2146    /*
2147     * Evaluate the first polynomial  (0 <= x <= Pi/4)
2148     *
2149     * z = _mm_mul_ps(x,x);
2150     */
2151    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2152 
2153    /*
2154     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2155     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2156     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2157     */
2158    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2159    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2160    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2161 
2162    /*
2163     * y = *(v4sf*)_ps_coscof_p0;
2164     * y = _mm_mul_ps(y, z);
2165     */
2166    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2167    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2168    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2169    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2170    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2171    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2172 
2173 
2174    /*
2175     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2176     * y = _mm_sub_ps(y, tmp);
2177     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2178     */
2179    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2180    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2181    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2182    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2183    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2184 
2185    /*
2186     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2187     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2188     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2189     */
2190    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2191    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2192    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2193 
2194    /*
2195     * Evaluate the second polynomial  (Pi/4 <= x <= Pi/2)
2196     *
2197     * y2 = *(v4sf*)_ps_sincof_p0;
2198     * y2 = _mm_mul_ps(y2, z);
2199     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2200     * y2 = _mm_mul_ps(y2, z);
2201     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2202     * y2 = _mm_mul_ps(y2, z);
2203     * y2 = _mm_mul_ps(y2, x);
2204     * y2 = _mm_add_ps(y2, x);
2205     */
2206 
2207    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2208    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2209    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2210    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2211    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2212    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2213    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2214 
2215    /*
2216     * select the correct result from the two polynomials
2217     * xmm3 = poly_mask;
2218     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2219     * y = _mm_andnot_ps(xmm3, y);
2220     * y = _mm_add_ps(y,y2);
2221     */
2222    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2223    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2224    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2225    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2226    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2227    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2228    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2229 
2230    /*
2231     * update the sign
2232     * y = _mm_xor_ps(y, sign_bit);
2233     */
2234    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2235    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2236    return y_result;
2237 }
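
/*
 * Hypothetical usage sketch (assumes a gallivm_state is already set up;
 * lp_build_context_init() and lp_float32_vec4_type() are declared in
 * lp_bld_type.h):
 *
 *    struct lp_build_context bld;
 *    lp_build_context_init(&bld, gallivm, lp_float32_vec4_type());
 *    LLVMValueRef s = lp_build_sin(&bld, a);
 *
 * lp_build_cos() below performs the same range reduction and evaluates
 * the same two polynomials; only the quadrant-to-sign/polynomial mapping
 * differs.
 */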
2238 
2239 
2240 /**
2241  * Generate cos(a) using SSE2
2242  */
2243 LLVMValueRef
2244 lp_build_cos(struct lp_build_context *bld,
2245              LLVMValueRef a)
2246 {
2247    struct gallivm_state *gallivm = bld->gallivm;
2248    LLVMBuilderRef builder = gallivm->builder;
2249    struct lp_type int_type = lp_int_type(bld->type);
2250    LLVMBuilderRef b = builder;
2251 
2252    /*
2253     *  take the absolute value,
2254     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2255     */
2256 
2257    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2258    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2259 
2260    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2261    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2262 
2263    /*
2264     * scale by 4/Pi
2265     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2266     */
2267 
2268    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2269    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2270 
2271    /*
2272     * store the integer part of y in mm0
2273     * emm2 = _mm_cvttps_epi32(y);
2274     */
2275 
2276    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2277 
2278    /*
2279     * j=(j+1) & (~1) (see the cephes sources)
2280     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2281     */
2282 
2283    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2284    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2285    /*
2286     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2287     */
2288    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2289    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2290 
2291    /*
2292     * y = _mm_cvtepi32_ps(emm2);
2293     */
2294    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2295 
2296 
2297    /*
2298     * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2299     */
2300    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2301    LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2302 
2303 
2304    /* get the swap sign flag
2305     * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2306     */
2307    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2308    LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2309    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2310    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2311 
2312    /*
2313     * emm2 = _mm_slli_epi32(emm0, 29);
2314     */
2315    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2316    LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2317 
2318    /*
2319     * get the polynomial selection mask:
2320     * there is one polynomial for 0 <= x <= Pi/4
2321     * and another one for Pi/4 < x <= Pi/2.
2322     * Both branches will be computed.
2323     *
2324     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2325     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2326     */
2327 
2328    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2329    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2330    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2331                                              int_type, PIPE_FUNC_EQUAL,
2332                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2333 
2334    /*
2335     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2336     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2337     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2338     */
2339    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2340    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2341    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2342 
2343    /*
2344     * The magic pass: "Extended precision modular arithmetic"
2345     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2346     * xmm1 = _mm_mul_ps(y, xmm1);
2347     * xmm2 = _mm_mul_ps(y, xmm2);
2348     * xmm3 = _mm_mul_ps(y, xmm3);
2349     */
2350    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2351    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2352    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2353 
2354    /*
2355     * x = _mm_add_ps(x, xmm1);
2356     * x = _mm_add_ps(x, xmm2);
2357     * x = _mm_add_ps(x, xmm3);
2358     */
2359 
2360    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2361    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2362    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2363 
2364    /*
2365     * Evaluate the first polynomial  (0 <= x <= Pi/4)
2366     *
2367     * z = _mm_mul_ps(x,x);
2368     */
2369    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2370 
2371    /*
2372     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2373     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2374     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2375     */
2376    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2377    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2378    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2379 
2380    /*
2381     * y = *(v4sf*)_ps_coscof_p0;
2382     * y = _mm_mul_ps(y, z);
2383     */
2384    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2385    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2386    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2387    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2388    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2389    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2390 
2391 
2392    /*
2393     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2394     * y = _mm_sub_ps(y, tmp);
2395     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2396     */
2397    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2398    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2399    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2400    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2401    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2402 
2403    /*
2404     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2405     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2406     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2407     */
2408    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2409    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2410    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2411 
2412    /*
2413     * Evaluate the second polynomial  (Pi/4 <= x <= Pi/2)
2414     *
2415     * y2 = *(v4sf*)_ps_sincof_p0;
2416     * y2 = _mm_mul_ps(y2, z);
2417     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2418     * y2 = _mm_mul_ps(y2, z);
2419     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2420     * y2 = _mm_mul_ps(y2, z);
2421     * y2 = _mm_mul_ps(y2, x);
2422     * y2 = _mm_add_ps(y2, x);
2423     */
2424 
2425    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2426    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2427    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2428    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2429    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2430    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2431    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2432 
2433    /*
2434     * select the correct result from the two polynomials
2435     * xmm3 = poly_mask;
2436     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2437     * y = _mm_andnot_ps(xmm3, y);
2438     * y = _mm_add_ps(y,y2);
2439     */
2440    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2441    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2442    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2443    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2444    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2445    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2446 
2447    /*
2448     * update the sign
2449     * y = _mm_xor_ps(y, sign_bit);
2450     */
2451    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_cos");
2452    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2453    return y_result;
2454 }
2455 
2456 
2457 /**
2458  * Generate pow(x, y)
2459  */
2460 LLVMValueRef
2461 lp_build_pow(struct lp_build_context *bld,
2462              LLVMValueRef x,
2463              LLVMValueRef y)
2464 {
2465    /* TODO: optimize the constant case */
2466    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2467        LLVMIsConstant(x) && LLVMIsConstant(y)) {
2468       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2469                    __FUNCTION__);
2470    }
2471 
2472    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2473 }
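
/*
 * Numeric check of the identity used above:
 * pow(2, 10) = exp2(10 * log2(2)) = exp2(10) = 1024.  Since the result
 * is computed via log2(x), it is only well-defined for x > 0, as with
 * GLSL's pow().
 */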
2474 
2475 
2476 /**
2477  * Generate exp(x)
2478  */
2479 LLVMValueRef
2480 lp_build_exp(struct lp_build_context *bld,
2481              LLVMValueRef x)
2482 {
2483    /* log2(e) = 1/log(2) */
2484    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2485                                            1.4426950408889634);
2486 
2487    assert(lp_check_value(bld->type, x));
2488 
2489    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2490 }
2491 
2492 
2493 /**
2494  * Generate log(x)
2495  */
2496 LLVMValueRef
2497 lp_build_log(struct lp_build_context *bld,
2498              LLVMValueRef x)
2499 {
2500    /* log(2) */
2501    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2502                                           0.69314718055994529);
2503 
2504    assert(lp_check_value(bld->type, x));
2505 
2506    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2507 }
2508 
2509 
2510 /**
2511  * Generate polynomial.
2512  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2513  */
2514 static LLVMValueRef
2515 lp_build_polynomial(struct lp_build_context *bld,
2516                     LLVMValueRef x,
2517                     const double *coeffs,
2518                     unsigned num_coeffs)
2519 {
2520    const struct lp_type type = bld->type;
2521    LLVMValueRef even = NULL, odd = NULL;
2522    LLVMValueRef x2;
2523    unsigned i;
2524 
2525    assert(lp_check_value(bld->type, x));
2526 
2527    /* TODO: optimize the constant case */
2528    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2529        LLVMIsConstant(x)) {
2530       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2531                    __FUNCTION__);
2532    }
2533 
2534    /*
2535     * Calculate odd and even terms separately to decrease data dependencies
2536     * Ex:
2537     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2538     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2539     */
2540    x2 = lp_build_mul(bld, x, x);
2541 
2542    for (i = num_coeffs; i--; ) {
2543       LLVMValueRef coeff;
2544 
2545       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2546 
2547       if (i % 2 == 0) {
2548          if (even)
2549             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2550          else
2551             even = coeff;
2552       } else {
2553          if (odd)
2554             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2555          else
2556             odd = coeff;
2557       }
2558    }
2559 
2560    if (odd)
2561       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2562    else if (even)
2563       return even;
2564    else
2565       return bld->undef;
2566 }
2567 
2568 
2569 /**
2570  * Minimax polynomial fit of 2**x, in range [0, 1[
2571  */
2572 const double lp_build_exp2_polynomial[] = {
2573 #if EXP_POLY_DEGREE == 5
2574    0.999999925063526176901,
2575    0.693153073200168932794,
2576    0.240153617044375388211,
2577    0.0558263180532956664775,
2578    0.00898934009049466391101,
2579    0.00187757667519147912699
2580 #elif EXP_POLY_DEGREE == 4
2581    1.00000259337069434683,
2582    0.693003834469974940458,
2583    0.24144275689150793076,
2584    0.0520114606103070150235,
2585    0.0135341679161270268764
2586 #elif EXP_POLY_DEGREE == 3
2587    0.999925218562710312959,
2588    0.695833540494823811697,
2589    0.226067155427249155588,
2590    0.0780245226406372992967
2591 #elif EXP_POLY_DEGREE == 2
2592    1.00172476321474503578,
2593    0.657636275736077639316,
2594    0.33718943461968720704
2595 #else
2596 #error
2597 #endif
2598 };
2599 
2600 
2601 void
2602 lp_build_exp2_approx(struct lp_build_context *bld,
2603                      LLVMValueRef x,
2604                      LLVMValueRef *p_exp2_int_part,
2605                      LLVMValueRef *p_frac_part,
2606                      LLVMValueRef *p_exp2)
2607 {
2608    LLVMBuilderRef builder = bld->gallivm->builder;
2609    const struct lp_type type = bld->type;
2610    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2611    LLVMValueRef ipart = NULL;
2612    LLVMValueRef fpart = NULL;
2613    LLVMValueRef expipart = NULL;
2614    LLVMValueRef expfpart = NULL;
2615    LLVMValueRef res = NULL;
2616 
2617    assert(lp_check_value(bld->type, x));
2618 
2619    if(p_exp2_int_part || p_frac_part || p_exp2) {
2620       /* TODO: optimize the constant case */
2621       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2622           LLVMIsConstant(x)) {
2623          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2624                       __FUNCTION__);
2625       }
2626 
2627       assert(type.floating && type.width == 32);
2628 
2629       x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
2630       x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2631 
2632       /* ipart = floor(x) */
2633       /* fpart = x - ipart */
2634       lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2635    }
2636 
2637    if(p_exp2_int_part || p_exp2) {
2638       /* expipart = (float) (1 << ipart) */
2639       expipart = LLVMBuildAdd(builder, ipart,
2640                               lp_build_const_int_vec(bld->gallivm, type, 127), "");
2641       expipart = LLVMBuildShl(builder, expipart,
2642                               lp_build_const_int_vec(bld->gallivm, type, 23), "");
2643       expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2644    }
2645 
2646    if(p_exp2) {
2647       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2648                                      Elements(lp_build_exp2_polynomial));
2649 
2650       res = LLVMBuildFMul(builder, expipart, expfpart, "");
2651    }
2652 
2653    if(p_exp2_int_part)
2654       *p_exp2_int_part = expipart;
2655 
2656    if(p_frac_part)
2657       *p_frac_part = fpart;
2658 
2659    if(p_exp2)
2660       *p_exp2 = res;
2661 }
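
/*
 * Worked example of the "(float) (1 << ipart)" bit trick above: for
 * ipart = 3, (3 + 127) << 23 = 0x41000000, which is exactly the IEEE-754
 * float32 bit pattern of 8.0 = 2^3.  The polynomial then supplies
 * 2^fpart for fpart in [0, 1[, and the final product is 2^x.
 */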
2662 
2663 
2664 LLVMValueRef
2665 lp_build_exp2(struct lp_build_context *bld,
2666               LLVMValueRef x)
2667 {
2668    LLVMValueRef res;
2669    lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2670    return res;
2671 }
2672 
2673 
2674 /**
2675  * Extract the exponent of an IEEE-754 floating point value.
2676  *
2677  * Optionally apply an integer bias.
2678  *
2679  * Result is an integer value with
2680  *
2681  *   ifloor(log2(x)) + bias
2682  */
2683 LLVMValueRef
2684 lp_build_extract_exponent(struct lp_build_context *bld,
2685                           LLVMValueRef x,
2686                           int bias)
2687 {
2688    LLVMBuilderRef builder = bld->gallivm->builder;
2689    const struct lp_type type = bld->type;
2690    unsigned mantissa = lp_mantissa(type);
2691    LLVMValueRef res;
2692 
2693    assert(type.floating);
2694 
2695    assert(lp_check_value(bld->type, x));
2696 
2697    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2698 
2699    res = LLVMBuildLShr(builder, x,
2700                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2701    res = LLVMBuildAnd(builder, res,
2702                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
2703    res = LLVMBuildSub(builder, res,
2704                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2705 
2706    return res;
2707 }
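
/*
 * Worked example: x = 8.0f has bit pattern 0x41000000.  Shifting right
 * by the 23 mantissa bits gives 130, masking with 255 keeps 130, and
 * subtracting the IEEE-754 bias of 127 (adjusted by 'bias') yields
 * 3 = ifloor(log2(8.0)).
 */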
2708 
2709 
2710 /**
2711  * Extract the mantissa of an IEEE-754 floating point value.
2712  *
2713  * Result is a floating point value with
2714  *
2715  *   x / 2**floor(log2(x))
2716  */
2717 LLVMValueRef
2718 lp_build_extract_mantissa(struct lp_build_context *bld,
2719                           LLVMValueRef x)
2720 {
2721    LLVMBuilderRef builder = bld->gallivm->builder;
2722    const struct lp_type type = bld->type;
2723    unsigned mantissa = lp_mantissa(type);
2724    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2725                                                   (1ULL << mantissa) - 1);
2726    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2727    LLVMValueRef res;
2728 
2729    assert(lp_check_value(bld->type, x));
2730 
2731    assert(type.floating);
2732 
2733    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2734 
2735    /* res = x / 2**ipart */
2736    res = LLVMBuildAnd(builder, x, mantmask, "");
2737    res = LLVMBuildOr(builder, res, one, "");
2738    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2739 
2740    return res;
2741 }
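
/*
 * Worked example: x = 12.0f has bit pattern 0x41400000.  Masking keeps
 * the mantissa bits 0x400000, and OR-ing in the exponent bits of 1.0
 * (0x3f800000) gives 0x3fc00000 = 1.5f = 12.0 / 2^3, i.e. x scaled into
 * [1, 2[.
 */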
2742 
2743 
2744 
2745 /**
2746  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[.
2747  * These coefficients can be generated with
2748  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2749  */
2750 const double lp_build_log2_polynomial[] = {
2751 #if LOG_POLY_DEGREE == 5
2752    2.88539008148777786488L,
2753    0.961796878841293367824L,
2754    0.577058946784739859012L,
2755    0.412914355135828735411L,
2756    0.308591899232910175289L,
2757    0.352376952300281371868L,
2758 #elif LOG_POLY_DEGREE == 4
2759    2.88539009343309178325L,
2760    0.961791550404184197881L,
2761    0.577440339438736392009L,
2762    0.403343858251329912514L,
2763    0.406718052498846252698L,
2764 #elif LOG_POLY_DEGREE == 3
2765    2.88538959748872753838L,
2766    0.961932915889597772928L,
2767    0.571118517972136195241L,
2768    0.493997535084709500285L,
2769 #else
2770 #error
2771 #endif
2772 };
2773 
2774 /**
2775  * See http://www.devmaster.net/forums/showthread.php?p=43580
2776  * http://en.wikipedia.org/wiki/Logarithm#Calculation
2777  * http://www.nezumi.demon.co.uk/consult/logx.htm
2778  */
2779 void
2780 lp_build_log2_approx(struct lp_build_context *bld,
2781                      LLVMValueRef x,
2782                      LLVMValueRef *p_exp,
2783                      LLVMValueRef *p_floor_log2,
2784                      LLVMValueRef *p_log2)
2785 {
2786    LLVMBuilderRef builder = bld->gallivm->builder;
2787    const struct lp_type type = bld->type;
2788    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2789    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2790 
2791    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2792    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2793    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2794 
2795    LLVMValueRef i = NULL;
2796    LLVMValueRef y = NULL;
2797    LLVMValueRef z = NULL;
2798    LLVMValueRef exp = NULL;
2799    LLVMValueRef mant = NULL;
2800    LLVMValueRef logexp = NULL;
2801    LLVMValueRef logmant = NULL;
2802    LLVMValueRef res = NULL;
2803 
2804    assert(lp_check_value(bld->type, x));
2805 
2806    if(p_exp || p_floor_log2 || p_log2) {
2807       /* TODO: optimize the constant case */
2808       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2809           LLVMIsConstant(x)) {
2810          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2811                       __FUNCTION__);
2812       }
2813 
2814       assert(type.floating && type.width == 32);
2815 
2816       /*
2817        * We don't explicitly handle denormalized numbers. They will yield a
2818        * result in the neighbourhood of -127, which appears to be
2819        * adequate.
2820        */
2821 
2822       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2823 
2824       /* exp = (float) exponent(x) */
2825       exp = LLVMBuildAnd(builder, i, expmask, "");
2826    }
2827 
2828    if(p_floor_log2 || p_log2) {
2829       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2830       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2831       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2832    }
2833 
2834    if(p_log2) {
2835       /* mant = 1 + (float) mantissa(x) */
2836       mant = LLVMBuildAnd(builder, i, mantmask, "");
2837       mant = LLVMBuildOr(builder, mant, one, "");
2838       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2839 
2840       /* y = (mant - 1) / (mant + 1) */
2841       y = lp_build_div(bld,
2842          lp_build_sub(bld, mant, bld->one),
2843          lp_build_add(bld, mant, bld->one)
2844       );
2845 
2846       /* z = y^2 */
2847       z = lp_build_mul(bld, y, y);
2848 
2849       /* compute P(z) */
2850       logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
2851                                     Elements(lp_build_log2_polynomial));
2852 
2853       /* logmant = y * P(z) */
2854       logmant = lp_build_mul(bld, y, logmant);
2855 
2856       res = lp_build_add(bld, logmant, logexp);
2857    }
2858 
2859    if(p_exp) {
2860       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2861       *p_exp = exp;
2862    }
2863 
2864    if(p_floor_log2)
2865       *p_floor_log2 = logexp;
2866 
2867    if(p_log2)
2868       *p_log2 = res;
2869 }
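
/*
 * Worked example: x = 1.5f.  The exponent field gives logexp = 0 and the
 * mantissa gives mant = 1.5, so y = (1.5 - 1) / (1.5 + 1) = 0.2,
 * z = 0.04, and y * P(z) = ~0.585 = log2(1.5).  This works because
 * mant = (1 + y) / (1 - y) and P above approximates
 * log2((1 + y) / (1 - y)) / y; mant in [1, 2[ keeps z within [0, 1/9[.
 */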
2870 
2871 
2872 LLVMValueRef
2873 lp_build_log2(struct lp_build_context *bld,
2874               LLVMValueRef x)
2875 {
2876    LLVMValueRef res;
2877    lp_build_log2_approx(bld, x, NULL, NULL, &res);
2878    return res;
2879 }
2880 
2881 
2882 /**
2883  * Faster (and less accurate) log2.
2884  *
2885  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2886  *
2887  * Piece-wise linear approximation, with exact results when x is a
2888  * power of two.
2889  *
2890  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2891  */
2892 LLVMValueRef
2893 lp_build_fast_log2(struct lp_build_context *bld,
2894                    LLVMValueRef x)
2895 {
2896    LLVMBuilderRef builder = bld->gallivm->builder;
2897    LLVMValueRef ipart;
2898    LLVMValueRef fpart;
2899 
2900    assert(lp_check_value(bld->type, x));
2901 
2902    assert(bld->type.floating);
2903 
2904    /* ipart = floor(log2(x)) - 1 */
2905    ipart = lp_build_extract_exponent(bld, x, -1);
2906    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2907 
2908    /* fpart = x / 2**floor(log2(x)) */
2909    fpart = lp_build_extract_mantissa(bld, x);
2910 
2911    /* ipart + fpart */
2912    return LLVMBuildFAdd(builder, ipart, fpart, "");
2913 }
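
/*
 * Worked example: x = 12.0 gives ipart = floor(log2(12)) - 1 = 2 and
 * fpart = 12 / 2^3 = 1.5, hence 3.5 versus the exact log2(12) = 3.585.
 * For x = 8.0 the result is 2 + 1.0 = 3.0, exact as promised.
 */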
2914 
2915 
2916 /**
2917  * Fast implementation of iround(log2(x)).
2918  *
2919  * Not an approximation -- it should give accurate results all the time.
2920  */
2921 LLVMValueRef
2922 lp_build_ilog2(struct lp_build_context *bld,
2923                LLVMValueRef x)
2924 {
2925    LLVMBuilderRef builder = bld->gallivm->builder;
2926    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2927    LLVMValueRef ipart;
2928 
2929    assert(bld->type.floating);
2930 
2931    assert(lp_check_value(bld->type, x));
2932 
2933    /* x * 2^0.5, i.e., add 0.5 to log2(x) */
2934    x = LLVMBuildFMul(builder, x, sqrt2, "");
2935 
2936    /* ipart = floor(log2(x) + 0.5)  */
2937    ipart = lp_build_extract_exponent(bld, x, 0);
2938 
2939    return ipart;
2940 }
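
/*
 * Worked example: x = 12.0.  Scaling by sqrt(2) gives ~16.97, whose
 * exponent field yields 4 = iround(log2(12)) (log2(12) = 3.585); adding
 * 0.5 to log2(x) before flooring is equivalent to rounding to nearest.
 */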
2941 
2942 LLVMValueRef
2943 lp_build_mod(struct lp_build_context *bld,
2944              LLVMValueRef x,
2945              LLVMValueRef y)
2946 {
2947    LLVMBuilderRef builder = bld->gallivm->builder;
2948    LLVMValueRef res;
2949    const struct lp_type type = bld->type;
2950 
2951    assert(lp_check_value(type, x));
2952    assert(lp_check_value(type, y));
2953 
2954    if (type.floating)
2955       res = LLVMBuildFRem(builder, x, y, "");
2956    else if (type.sign)
2957       res = LLVMBuildSRem(builder, x, y, "");
2958    else
2959       res = LLVMBuildURem(builder, x, y, "");
2960    return res;
2961 }
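
/*
 * Note on semantics: LLVM's [F|S|U]Rem, like C's % operator, gives a
 * result with the sign of the dividend, e.g. lp_build_mod() over
 * x = -7, y = 2 yields -1 rather than 1.  Callers needing a GLSL-style
 * mod() with the sign of the divisor must adjust the result themselves.
 */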
2962