1 /**************************************************************************
2  *
3  * Copyright 2009-2010 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 
29 /**
30  * @file
31  * Helper
32  *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
39  * - it is very easy given we have all necessary information readily available
40  * - LLVM optimization passes fail to simplify several vector expressions
41  * - We often know value constraints which the optimization passes have no way
42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
43  *
44  * @author Jose Fonseca <jfonseca@vmware.com>
45  */
46 
47 
48 #include <float.h>
49 
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54 
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65 
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69 
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73 
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77 
78 #define EXP_POLY_DEGREE 5
79 
80 #define LOG_POLY_DEGREE 4
81 
82 
83 /**
84  * Generate min(a, b)
85  * No checks for special case values of a or b = 1 or 0 are done.
86  * NaN's are handled according to the behavior specified by the
87  * nan_behavior argument.
88  */
89 static LLVMValueRef
lp_build_min_simple(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b,enum gallivm_nan_behavior nan_behavior)90 lp_build_min_simple(struct lp_build_context *bld,
91                     LLVMValueRef a,
92                     LLVMValueRef b,
93                     enum gallivm_nan_behavior nan_behavior)
94 {
95    const struct lp_type type = bld->type;
96    const char *intrinsic = NULL;
97    unsigned intr_size = 0;
98    LLVMValueRef cond;
99 
100    assert(lp_check_value(type, a));
101    assert(lp_check_value(type, b));
102 
103    /* TODO: optimize the constant case */
104 
105    if (type.floating && util_cpu_caps.has_sse) {
106       if (type.width == 32) {
107          if (type.length == 1) {
108             intrinsic = "llvm.x86.sse.min.ss";
109             intr_size = 128;
110          }
111          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112             intrinsic = "llvm.x86.sse.min.ps";
113             intr_size = 128;
114          }
115          else {
116             intrinsic = "llvm.x86.avx.min.ps.256";
117             intr_size = 256;
118          }
119       }
120       if (type.width == 64 && util_cpu_caps.has_sse2) {
121          if (type.length == 1) {
122             intrinsic = "llvm.x86.sse2.min.sd";
123             intr_size = 128;
124          }
125          else if (type.length == 2 || !util_cpu_caps.has_avx) {
126             intrinsic = "llvm.x86.sse2.min.pd";
127             intr_size = 128;
128          }
129          else {
130             intrinsic = "llvm.x86.avx.min.pd.256";
131             intr_size = 256;
132          }
133       }
134    }
135    else if (type.floating && util_cpu_caps.has_altivec) {
136       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139                       __FUNCTION__);
140       }
141       if (type.width == 32 && type.length == 4) {
142          intrinsic = "llvm.ppc.altivec.vminfp";
143          intr_size = 128;
144       }
145    } else if (HAVE_LLVM < 0x0309 &&
146               util_cpu_caps.has_avx2 && type.length > 4) {
147       intr_size = 256;
148       switch (type.width) {
149       case 8:
150          intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151          break;
152       case 16:
153          intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154          break;
155       case 32:
156          intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157          break;
158       }
159    } else if (HAVE_LLVM < 0x0309 &&
160               util_cpu_caps.has_sse2 && type.length >= 2) {
161       intr_size = 128;
162       if ((type.width == 8 || type.width == 16) &&
163           (type.width * type.length <= 64) &&
164           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166                       __FUNCTION__);
167       }
168       if (type.width == 8 && !type.sign) {
169          intrinsic = "llvm.x86.sse2.pminu.b";
170       }
171       else if (type.width == 16 && type.sign) {
172          intrinsic = "llvm.x86.sse2.pmins.w";
173       }
174       if (util_cpu_caps.has_sse4_1) {
175          if (type.width == 8 && type.sign) {
176             intrinsic = "llvm.x86.sse41.pminsb";
177          }
178          if (type.width == 16 && !type.sign) {
179             intrinsic = "llvm.x86.sse41.pminuw";
180          }
181          if (type.width == 32 && !type.sign) {
182             intrinsic = "llvm.x86.sse41.pminud";
183          }
184          if (type.width == 32 && type.sign) {
185             intrinsic = "llvm.x86.sse41.pminsd";
186          }
187       }
188    } else if (util_cpu_caps.has_altivec) {
189       intr_size = 128;
190       if (type.width == 8) {
191          if (!type.sign) {
192             intrinsic = "llvm.ppc.altivec.vminub";
193          } else {
194             intrinsic = "llvm.ppc.altivec.vminsb";
195          }
196       } else if (type.width == 16) {
197          if (!type.sign) {
198             intrinsic = "llvm.ppc.altivec.vminuh";
199          } else {
200             intrinsic = "llvm.ppc.altivec.vminsh";
201          }
202       } else if (type.width == 32) {
203          if (!type.sign) {
204             intrinsic = "llvm.ppc.altivec.vminuw";
205          } else {
206             intrinsic = "llvm.ppc.altivec.vminsw";
207          }
208       }
209    }
210 
211    if (intrinsic) {
212       /* We need to handle nan's for floating point numbers. If one of the
213        * inputs is nan the other should be returned (required by both D3D10+
214        * and OpenCL).
215        * The sse intrinsics return the second operator in case of nan by
216        * default so we need to special code to handle those.
217        */
218       if (util_cpu_caps.has_sse && type.floating &&
219           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222          LLVMValueRef isnan, min;
223          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224                                                    type,
225                                                    intr_size, a, b);
226          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227             isnan = lp_build_isnan(bld, b);
228             return lp_build_select(bld, isnan, a, min);
229          } else {
230             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231             isnan = lp_build_isnan(bld, a);
232             return lp_build_select(bld, isnan, a, min);
233          }
234       } else {
235          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236                                                     type,
237                                                     intr_size, a, b);
238       }
239    }
240 
241    if (type.floating) {
242       switch (nan_behavior) {
243       case GALLIVM_NAN_RETURN_NAN: {
244          LLVMValueRef isnan = lp_build_isnan(bld, b);
245          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247          return lp_build_select(bld, cond, a, b);
248       }
249          break;
250       case GALLIVM_NAN_RETURN_OTHER: {
251          LLVMValueRef isnan = lp_build_isnan(bld, a);
252          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254          return lp_build_select(bld, cond, a, b);
255       }
256          break;
257       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259          return lp_build_select(bld, cond, a, b);
260       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262          return lp_build_select(bld, cond, b, a);
263       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265          return lp_build_select(bld, cond, a, b);
266          break;
267       default:
268          assert(0);
269          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270          return lp_build_select(bld, cond, a, b);
271       }
272    } else {
273       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274       return lp_build_select(bld, cond, a, b);
275    }
276 }
277 
278 
279 LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,LLVMValueRef a,LLVMValueRef b,LLVMValueRef c)280 lp_build_fmuladd(LLVMBuilderRef builder,
281                  LLVMValueRef a,
282                  LLVMValueRef b,
283                  LLVMValueRef c)
284 {
285    LLVMTypeRef type = LLVMTypeOf(a);
286    assert(type == LLVMTypeOf(b));
287    assert(type == LLVMTypeOf(c));
288    if (HAVE_LLVM < 0x0304) {
289       /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
290        * not supported, and instead it falls-back to a C function.
291        */
292       return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293    }
294    char intrinsic[32];
295    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296    LLVMValueRef args[] = { a, b, c };
297    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299 
300 
301 /**
302  * Generate max(a, b)
303  * No checks for special case values of a or b = 1 or 0 are done.
304  * NaN's are handled according to the behavior specified by the
305  * nan_behavior argument.
306  */
307 static LLVMValueRef
lp_build_max_simple(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b,enum gallivm_nan_behavior nan_behavior)308 lp_build_max_simple(struct lp_build_context *bld,
309                     LLVMValueRef a,
310                     LLVMValueRef b,
311                     enum gallivm_nan_behavior nan_behavior)
312 {
313    const struct lp_type type = bld->type;
314    const char *intrinsic = NULL;
315    unsigned intr_size = 0;
316    LLVMValueRef cond;
317 
318    assert(lp_check_value(type, a));
319    assert(lp_check_value(type, b));
320 
321    /* TODO: optimize the constant case */
322 
323    if (type.floating && util_cpu_caps.has_sse) {
324       if (type.width == 32) {
325          if (type.length == 1) {
326             intrinsic = "llvm.x86.sse.max.ss";
327             intr_size = 128;
328          }
329          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330             intrinsic = "llvm.x86.sse.max.ps";
331             intr_size = 128;
332          }
333          else {
334             intrinsic = "llvm.x86.avx.max.ps.256";
335             intr_size = 256;
336          }
337       }
338       if (type.width == 64 && util_cpu_caps.has_sse2) {
339          if (type.length == 1) {
340             intrinsic = "llvm.x86.sse2.max.sd";
341             intr_size = 128;
342          }
343          else if (type.length == 2 || !util_cpu_caps.has_avx) {
344             intrinsic = "llvm.x86.sse2.max.pd";
345             intr_size = 128;
346          }
347          else {
348             intrinsic = "llvm.x86.avx.max.pd.256";
349             intr_size = 256;
350          }
351       }
352    }
353    else if (type.floating && util_cpu_caps.has_altivec) {
354       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357                       __FUNCTION__);
358       }
359       if (type.width == 32 || type.length == 4) {
360          intrinsic = "llvm.ppc.altivec.vmaxfp";
361          intr_size = 128;
362       }
363    } else if (HAVE_LLVM < 0x0309 &&
364               util_cpu_caps.has_avx2 && type.length > 4) {
365       intr_size = 256;
366       switch (type.width) {
367       case 8:
368          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369          break;
370       case 16:
371          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372          break;
373       case 32:
374          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375          break;
376       }
377    } else if (HAVE_LLVM < 0x0309 &&
378               util_cpu_caps.has_sse2 && type.length >= 2) {
379       intr_size = 128;
380       if ((type.width == 8 || type.width == 16) &&
381           (type.width * type.length <= 64) &&
382           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384                       __FUNCTION__);
385          }
386       if (type.width == 8 && !type.sign) {
387          intrinsic = "llvm.x86.sse2.pmaxu.b";
388          intr_size = 128;
389       }
390       else if (type.width == 16 && type.sign) {
391          intrinsic = "llvm.x86.sse2.pmaxs.w";
392       }
393       if (util_cpu_caps.has_sse4_1) {
394          if (type.width == 8 && type.sign) {
395             intrinsic = "llvm.x86.sse41.pmaxsb";
396          }
397          if (type.width == 16 && !type.sign) {
398             intrinsic = "llvm.x86.sse41.pmaxuw";
399          }
400          if (type.width == 32 && !type.sign) {
401             intrinsic = "llvm.x86.sse41.pmaxud";
402         }
403          if (type.width == 32 && type.sign) {
404             intrinsic = "llvm.x86.sse41.pmaxsd";
405          }
406       }
407    } else if (util_cpu_caps.has_altivec) {
408      intr_size = 128;
409      if (type.width == 8) {
410        if (!type.sign) {
411          intrinsic = "llvm.ppc.altivec.vmaxub";
412        } else {
413          intrinsic = "llvm.ppc.altivec.vmaxsb";
414        }
415      } else if (type.width == 16) {
416        if (!type.sign) {
417          intrinsic = "llvm.ppc.altivec.vmaxuh";
418        } else {
419          intrinsic = "llvm.ppc.altivec.vmaxsh";
420        }
421      } else if (type.width == 32) {
422        if (!type.sign) {
423          intrinsic = "llvm.ppc.altivec.vmaxuw";
424        } else {
425          intrinsic = "llvm.ppc.altivec.vmaxsw";
426        }
427      }
428    }
429 
430    if (intrinsic) {
431       if (util_cpu_caps.has_sse && type.floating &&
432           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435          LLVMValueRef isnan, max;
436          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437                                                    type,
438                                                    intr_size, a, b);
439          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440             isnan = lp_build_isnan(bld, b);
441             return lp_build_select(bld, isnan, a, max);
442          } else {
443             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444             isnan = lp_build_isnan(bld, a);
445             return lp_build_select(bld, isnan, a, max);
446          }
447       } else {
448          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449                                                     type,
450                                                     intr_size, a, b);
451       }
452    }
453 
454    if (type.floating) {
455       switch (nan_behavior) {
456       case GALLIVM_NAN_RETURN_NAN: {
457          LLVMValueRef isnan = lp_build_isnan(bld, b);
458          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460          return lp_build_select(bld, cond, a, b);
461       }
462          break;
463       case GALLIVM_NAN_RETURN_OTHER: {
464          LLVMValueRef isnan = lp_build_isnan(bld, a);
465          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467          return lp_build_select(bld, cond, a, b);
468       }
469          break;
470       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472          return lp_build_select(bld, cond, a, b);
473       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475          return lp_build_select(bld, cond, b, a);
476       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478          return lp_build_select(bld, cond, a, b);
479          break;
480       default:
481          assert(0);
482          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483          return lp_build_select(bld, cond, a, b);
484       }
485    } else {
486       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487       return lp_build_select(bld, cond, a, b);
488    }
489 }
490 
491 
492 /**
493  * Generate 1 - a, or ~a depending on bld->type.
494  */
495 LLVMValueRef
lp_build_comp(struct lp_build_context * bld,LLVMValueRef a)496 lp_build_comp(struct lp_build_context *bld,
497               LLVMValueRef a)
498 {
499    LLVMBuilderRef builder = bld->gallivm->builder;
500    const struct lp_type type = bld->type;
501 
502    assert(lp_check_value(type, a));
503 
504    if(a == bld->one)
505       return bld->zero;
506    if(a == bld->zero)
507       return bld->one;
508 
509    if(type.norm && !type.floating && !type.fixed && !type.sign) {
510       if(LLVMIsConstant(a))
511          return LLVMConstNot(a);
512       else
513          return LLVMBuildNot(builder, a, "");
514    }
515 
516    if(LLVMIsConstant(a))
517       if (type.floating)
518           return LLVMConstFSub(bld->one, a);
519       else
520           return LLVMConstSub(bld->one, a);
521    else
522       if (type.floating)
523          return LLVMBuildFSub(builder, bld->one, a, "");
524       else
525          return LLVMBuildSub(builder, bld->one, a, "");
526 }
527 
528 
529 /**
530  * Generate a + b
531  */
532 LLVMValueRef
lp_build_add(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b)533 lp_build_add(struct lp_build_context *bld,
534              LLVMValueRef a,
535              LLVMValueRef b)
536 {
537    LLVMBuilderRef builder = bld->gallivm->builder;
538    const struct lp_type type = bld->type;
539    LLVMValueRef res;
540 
541    assert(lp_check_value(type, a));
542    assert(lp_check_value(type, b));
543 
544    if (a == bld->zero)
545       return b;
546    if (b == bld->zero)
547       return a;
548    if (a == bld->undef || b == bld->undef)
549       return bld->undef;
550 
551    if (type.norm) {
552       const char *intrinsic = NULL;
553 
554       if (!type.sign && (a == bld->one || b == bld->one))
555         return bld->one;
556 
557       if (!type.floating && !type.fixed) {
558          if (type.width * type.length == 128) {
559             if (util_cpu_caps.has_sse2) {
560               if (type.width == 8)
561                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562               if (type.width == 16)
563                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564             } else if (util_cpu_caps.has_altivec) {
565               if (type.width == 8)
566                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567               if (type.width == 16)
568                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569             }
570          }
571          if (type.width * type.length == 256) {
572             if (util_cpu_caps.has_avx2) {
573               if (type.width == 8)
574                 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575               if (type.width == 16)
576                 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577             }
578          }
579       }
580 
581       if (intrinsic)
582          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583    }
584 
585    if(type.norm && !type.floating && !type.fixed) {
586       if (type.sign) {
587          uint64_t sign = (uint64_t)1 << (type.width - 1);
588          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590          /* a_clamp_max is the maximum a for positive b,
591             a_clamp_min is the minimum a for negative b. */
592          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595       } else {
596          a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597       }
598    }
599 
600    if(LLVMIsConstant(a) && LLVMIsConstant(b))
601       if (type.floating)
602          res = LLVMConstFAdd(a, b);
603       else
604          res = LLVMConstAdd(a, b);
605    else
606       if (type.floating)
607          res = LLVMBuildFAdd(builder, a, b, "");
608       else
609          res = LLVMBuildAdd(builder, a, b, "");
610 
611    /* clamp to ceiling of 1.0 */
612    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614 
615    /* XXX clamp to floor of -1 or 0??? */
616 
617    return res;
618 }
619 
620 
621 /** Return the scalar sum of the elements of a.
622  * Should avoid this operation whenever possible.
623  */
624 LLVMValueRef
lp_build_horizontal_add(struct lp_build_context * bld,LLVMValueRef a)625 lp_build_horizontal_add(struct lp_build_context *bld,
626                         LLVMValueRef a)
627 {
628    LLVMBuilderRef builder = bld->gallivm->builder;
629    const struct lp_type type = bld->type;
630    LLVMValueRef index, res;
631    unsigned i, length;
632    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634    LLVMValueRef vecres, elem2;
635 
636    assert(lp_check_value(type, a));
637 
638    if (type.length == 1) {
639       return a;
640    }
641 
642    assert(!bld->type.norm);
643 
644    /*
645     * for byte vectors can do much better with psadbw.
646     * Using repeated shuffle/adds here. Note with multiple vectors
647     * this can be done more efficiently as outlined in the intel
648     * optimization manual.
649     * Note: could cause data rearrangement if used with smaller element
650     * sizes.
651     */
652 
653    vecres = a;
654    length = type.length / 2;
655    while (length > 1) {
656       LLVMValueRef vec1, vec2;
657       for (i = 0; i < length; i++) {
658          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660       }
661       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662                                     LLVMConstVector(shuffles1, length), "");
663       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664                                     LLVMConstVector(shuffles2, length), "");
665       if (type.floating) {
666          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667       }
668       else {
669          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670       }
671       length = length >> 1;
672    }
673 
674    /* always have vector of size 2 here */
675    assert(length == 1);
676 
677    index = lp_build_const_int32(bld->gallivm, 0);
678    res = LLVMBuildExtractElement(builder, vecres, index, "");
679    index = lp_build_const_int32(bld->gallivm, 1);
680    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681 
682    if (type.floating)
683       res = LLVMBuildFAdd(builder, res, elem2, "");
684     else
685       res = LLVMBuildAdd(builder, res, elem2, "");
686 
687    return res;
688 }
689 
690 /**
691  * Return the horizontal sums of 4 float vectors as a float4 vector.
692  * This uses the technique as outlined in Intel Optimization Manual.
693  */
694 static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context * bld,LLVMValueRef src[4])695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696                             LLVMValueRef src[4])
697 {
698    struct gallivm_state *gallivm = bld->gallivm;
699    LLVMBuilderRef builder = gallivm->builder;
700    LLVMValueRef shuffles[4];
701    LLVMValueRef tmp[4];
702    LLVMValueRef sumtmp[2], shuftmp[2];
703 
704    /* lower half of regs */
705    shuffles[0] = lp_build_const_int32(gallivm, 0);
706    shuffles[1] = lp_build_const_int32(gallivm, 1);
707    shuffles[2] = lp_build_const_int32(gallivm, 4);
708    shuffles[3] = lp_build_const_int32(gallivm, 5);
709    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710                                    LLVMConstVector(shuffles, 4), "");
711    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712                                    LLVMConstVector(shuffles, 4), "");
713 
714    /* upper half of regs */
715    shuffles[0] = lp_build_const_int32(gallivm, 2);
716    shuffles[1] = lp_build_const_int32(gallivm, 3);
717    shuffles[2] = lp_build_const_int32(gallivm, 6);
718    shuffles[3] = lp_build_const_int32(gallivm, 7);
719    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720                                    LLVMConstVector(shuffles, 4), "");
721    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722                                    LLVMConstVector(shuffles, 4), "");
723 
724    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726 
727    shuffles[0] = lp_build_const_int32(gallivm, 0);
728    shuffles[1] = lp_build_const_int32(gallivm, 2);
729    shuffles[2] = lp_build_const_int32(gallivm, 4);
730    shuffles[3] = lp_build_const_int32(gallivm, 6);
731    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732                                        LLVMConstVector(shuffles, 4), "");
733 
734    shuffles[0] = lp_build_const_int32(gallivm, 1);
735    shuffles[1] = lp_build_const_int32(gallivm, 3);
736    shuffles[2] = lp_build_const_int32(gallivm, 5);
737    shuffles[3] = lp_build_const_int32(gallivm, 7);
738    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739                                        LLVMConstVector(shuffles, 4), "");
740 
741    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743 
744 
745 /*
746  * partially horizontally add 2-4 float vectors with length nx4,
747  * i.e. only four adjacent values in each vector will be added,
748  * assuming values are really grouped in 4 which also determines
749  * output order.
750  *
751  * Return a vector of the same length as the initial vectors,
752  * with the excess elements (if any) being undefined.
753  * The element order is independent of number of input vectors.
754  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755  * the output order thus will be
756  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
757  */
758 LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context * bld,LLVMValueRef vectors[],unsigned num_vecs)759 lp_build_hadd_partial4(struct lp_build_context *bld,
760                        LLVMValueRef vectors[],
761                        unsigned num_vecs)
762 {
763    struct gallivm_state *gallivm = bld->gallivm;
764    LLVMBuilderRef builder = gallivm->builder;
765    LLVMValueRef ret_vec;
766    LLVMValueRef tmp[4];
767    const char *intrinsic = NULL;
768 
769    assert(num_vecs >= 2 && num_vecs <= 4);
770    assert(bld->type.floating);
771 
772    /* only use this with at least 2 vectors, as it is sort of expensive
773     * (depending on cpu) and we always need two horizontal adds anyway,
774     * so a shuffle/add approach might be better.
775     */
776 
777    tmp[0] = vectors[0];
778    tmp[1] = vectors[1];
779 
780    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782 
783    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784        bld->type.length == 4) {
785       intrinsic = "llvm.x86.sse3.hadd.ps";
786    }
787    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788             bld->type.length == 8) {
789       intrinsic = "llvm.x86.avx.hadd.ps.256";
790    }
791    if (intrinsic) {
792       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793                                        lp_build_vec_type(gallivm, bld->type),
794                                        tmp[0], tmp[1]);
795       if (num_vecs > 2) {
796          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797                                           lp_build_vec_type(gallivm, bld->type),
798                                           tmp[2], tmp[3]);
799       }
800       else {
801          tmp[1] = tmp[0];
802       }
803       return lp_build_intrinsic_binary(builder, intrinsic,
804                                        lp_build_vec_type(gallivm, bld->type),
805                                        tmp[0], tmp[1]);
806    }
807 
808    if (bld->type.length == 4) {
809       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810    }
811    else {
812       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813       unsigned j;
814       unsigned num_iter = bld->type.length / 4;
815       struct lp_type parttype = bld->type;
816       parttype.length = 4;
817       for (j = 0; j < num_iter; j++) {
818          LLVMValueRef partsrc[4];
819          unsigned i;
820          for (i = 0; i < 4; i++) {
821             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822          }
823          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824       }
825       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826    }
827    return ret_vec;
828 }
829 
830 /**
831  * Generate a - b
832  */
833 LLVMValueRef
lp_build_sub(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b)834 lp_build_sub(struct lp_build_context *bld,
835              LLVMValueRef a,
836              LLVMValueRef b)
837 {
838    LLVMBuilderRef builder = bld->gallivm->builder;
839    const struct lp_type type = bld->type;
840    LLVMValueRef res;
841 
842    assert(lp_check_value(type, a));
843    assert(lp_check_value(type, b));
844 
845    if (b == bld->zero)
846       return a;
847    if (a == bld->undef || b == bld->undef)
848       return bld->undef;
849    if (a == b)
850       return bld->zero;
851 
852    if (type.norm) {
853       const char *intrinsic = NULL;
854 
855       if (!type.sign && b == bld->one)
856         return bld->zero;
857 
858       if (!type.floating && !type.fixed) {
859          if (type.width * type.length == 128) {
860             if (util_cpu_caps.has_sse2) {
861               if (type.width == 8)
862                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863               if (type.width == 16)
864                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865             } else if (util_cpu_caps.has_altivec) {
866               if (type.width == 8)
867                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868               if (type.width == 16)
869                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870             }
871          }
872          if (type.width * type.length == 256) {
873             if (util_cpu_caps.has_avx2) {
874               if (type.width == 8)
875                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876               if (type.width == 16)
877                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878             }
879          }
880       }
881 
882       if (intrinsic)
883          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884    }
885 
886    if(type.norm && !type.floating && !type.fixed) {
887       if (type.sign) {
888          uint64_t sign = (uint64_t)1 << (type.width - 1);
889          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891          /* a_clamp_max is the maximum a for negative b,
892             a_clamp_min is the minimum a for positive b. */
893          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896       } else {
897          a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898       }
899    }
900 
901    if(LLVMIsConstant(a) && LLVMIsConstant(b))
902       if (type.floating)
903          res = LLVMConstFSub(a, b);
904       else
905          res = LLVMConstSub(a, b);
906    else
907       if (type.floating)
908          res = LLVMBuildFSub(builder, a, b, "");
909       else
910          res = LLVMBuildSub(builder, a, b, "");
911 
912    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914 
915    return res;
916 }
917 
918 
919 
920 /**
921  * Normalized multiplication.
922  *
923  * There are several approaches for (using 8-bit normalized multiplication as
924  * an example):
925  *
926  * - alpha plus one
927  *
928  *     makes the following approximation to the division (Sree)
929  *
930  *       a*b/255 ~= (a*(b + 1)) >> 256
931  *
932  *     which is the fastest method that satisfies the following OpenGL criteria of
933  *
934  *       0*0 = 0 and 255*255 = 255
935  *
936  * - geometric series
937  *
938  *     takes the geometric series approximation to the division
939  *
940  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941  *
942  *     in this case just the first two terms to fit in 16bit arithmetic
943  *
944  *       t/255 ~= (t + (t >> 8)) >> 8
945  *
946  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
947  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
948  *     must be used.
949  *
950  * - geometric series plus rounding
951  *
952  *     when using a geometric series division instead of truncating the result
953  *     use roundoff in the approximation (Jim Blinn)
954  *
955  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
956  *
957  *     achieving the exact results.
958  *
959  *
960  *
961  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963  * @sa Michael Herf, The "double blend trick", May 2000,
964  *     http://www.stereopsis.com/doubleblend.html
965  */
966 LLVMValueRef
lp_build_mul_norm(struct gallivm_state * gallivm,struct lp_type wide_type,LLVMValueRef a,LLVMValueRef b)967 lp_build_mul_norm(struct gallivm_state *gallivm,
968                   struct lp_type wide_type,
969                   LLVMValueRef a, LLVMValueRef b)
970 {
971    LLVMBuilderRef builder = gallivm->builder;
972    struct lp_build_context bld;
973    unsigned n;
974    LLVMValueRef half;
975    LLVMValueRef ab;
976 
977    assert(!wide_type.floating);
978    assert(lp_check_value(wide_type, a));
979    assert(lp_check_value(wide_type, b));
980 
981    lp_build_context_init(&bld, gallivm, wide_type);
982 
983    n = wide_type.width / 2;
984    if (wide_type.sign) {
985       --n;
986    }
987 
988    /*
989     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991     */
992 
993    /*
994     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995     */
996 
997    ab = LLVMBuildMul(builder, a, b, "");
998    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999 
1000    /*
1001     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002     */
1003 
1004    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005    if (wide_type.sign) {
1006       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008       half = lp_build_select(&bld, sign, minus_half, half);
1009    }
1010    ab = LLVMBuildAdd(builder, ab, half, "");
1011 
1012    /* Final division */
1013    ab = lp_build_shr_imm(&bld, ab, n);
1014 
1015    return ab;
1016 }
1017 
1018 /**
1019  * Generate a * b
1020  */
1021 LLVMValueRef
lp_build_mul(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b)1022 lp_build_mul(struct lp_build_context *bld,
1023              LLVMValueRef a,
1024              LLVMValueRef b)
1025 {
1026    LLVMBuilderRef builder = bld->gallivm->builder;
1027    const struct lp_type type = bld->type;
1028    LLVMValueRef shift;
1029    LLVMValueRef res;
1030 
1031    assert(lp_check_value(type, a));
1032    assert(lp_check_value(type, b));
1033 
1034    if(a == bld->zero)
1035       return bld->zero;
1036    if(a == bld->one)
1037       return b;
1038    if(b == bld->zero)
1039       return bld->zero;
1040    if(b == bld->one)
1041       return a;
1042    if(a == bld->undef || b == bld->undef)
1043       return bld->undef;
1044 
1045    if (!type.floating && !type.fixed && type.norm) {
1046       struct lp_type wide_type = lp_wider_type(type);
1047       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048 
1049       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051 
1052       /* PMULLW, PSRLW, PADDW */
1053       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055 
1056       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057 
1058       return ab;
1059    }
1060 
1061    if(type.fixed)
1062       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063    else
1064       shift = NULL;
1065 
1066    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067       if (type.floating)
1068          res = LLVMConstFMul(a, b);
1069       else
1070          res = LLVMConstMul(a, b);
1071       if(shift) {
1072          if(type.sign)
1073             res = LLVMConstAShr(res, shift);
1074          else
1075             res = LLVMConstLShr(res, shift);
1076       }
1077    }
1078    else {
1079       if (type.floating)
1080          res = LLVMBuildFMul(builder, a, b, "");
1081       else
1082          res = LLVMBuildMul(builder, a, b, "");
1083       if(shift) {
1084          if(type.sign)
1085             res = LLVMBuildAShr(builder, res, shift, "");
1086          else
1087             res = LLVMBuildLShr(builder, res, shift, "");
1088       }
1089    }
1090 
1091    return res;
1092 }
1093 
1094 /*
1095  * Widening mul, valid for 32x32 bit -> 64bit only.
1096  * Result is low 32bits, high bits returned in res_hi.
1097  *
1098  * Emits code that is meant to be compiled for the host CPU.
1099  */
1100 LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b,LLVMValueRef * res_hi)1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102                          LLVMValueRef a,
1103                          LLVMValueRef b,
1104                          LLVMValueRef *res_hi)
1105 {
1106    struct gallivm_state *gallivm = bld->gallivm;
1107    LLVMBuilderRef builder = gallivm->builder;
1108 
1109    assert(bld->type.width == 32);
1110    assert(bld->type.floating == 0);
1111    assert(bld->type.fixed == 0);
1112    assert(bld->type.norm == 0);
1113 
1114    /*
1115     * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1116     * for x86 simd is atrocious (even if the high bits weren't required),
1117     * trying to handle real 64bit inputs (which of course can't happen due
1118     * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1119     * apparently llvm does not recognize this widening mul). This includes 6
1120     * (instead of 2) pmuludq plus extra adds and shifts
1121     * The same story applies to signed mul, albeit fixing this requires sse41.
1122     * https://llvm.org/bugs/show_bug.cgi?id=30845
1123     * So, whip up our own code, albeit only for length 4 and 8 (which
1124     * should be good enough)...
1125     */
1126    if ((bld->type.length == 4 || bld->type.length == 8) &&
1127        ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128         util_cpu_caps.has_sse4_1)) {
1129       const char *intrinsic = NULL;
1130       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132       struct lp_type type_wide = lp_wider_type(bld->type);
1133       LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134       unsigned i;
1135       for (i = 0; i < bld->type.length; i += 2) {
1136          shuf[i] = lp_build_const_int32(gallivm, i+1);
1137          shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138       }
1139       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140       aeven = a;
1141       beven = b;
1142       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144 
1145       if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146          if (bld->type.sign) {
1147             intrinsic = "llvm.x86.avx2.pmul.dq";
1148          } else {
1149             intrinsic = "llvm.x86.avx2.pmulu.dq";
1150          }
1151          muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152                                              wider_type, aeven, beven);
1153          mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154                                             wider_type, aodd, bodd);
1155       }
1156       else {
1157          /* for consistent naming look elsewhere... */
1158          if (bld->type.sign) {
1159             intrinsic = "llvm.x86.sse41.pmuldq";
1160          } else {
1161             intrinsic = "llvm.x86.sse2.pmulu.dq";
1162          }
1163          /*
1164           * XXX If we only have AVX but not AVX2 this is a pain.
1165           * lp_build_intrinsic_binary_anylength() can't handle it
1166           * (due to src and dst type not being identical).
1167           */
1168          if (bld->type.length == 8) {
1169             LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170             LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171             LLVMValueRef muleven2[2], mulodd2[2];
1172             struct lp_type type_wide_half = type_wide;
1173             LLVMTypeRef wtype_half;
1174             type_wide_half.length = 2;
1175             wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176             aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177             aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178             bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179             bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180             aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181             aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182             boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183             boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184             muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185                                                     wtype_half, aevenlo, bevenlo);
1186             mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187                                                    wtype_half, aoddlo, boddlo);
1188             muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189                                                     wtype_half, aevenhi, bevenhi);
1190             mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191                                                    wtype_half, aoddhi, boddhi);
1192             muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193             mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194 
1195          }
1196          else {
1197             muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198                                                 wider_type, aeven, beven);
1199             mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200                                                wider_type, aodd, bodd);
1201          }
1202       }
1203       muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204       mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205 
1206       for (i = 0; i < bld->type.length; i += 2) {
1207          shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208          shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209       }
1210       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211       *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212 
1213       for (i = 0; i < bld->type.length; i += 2) {
1214          shuf[i] = lp_build_const_int32(gallivm, i);
1215          shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216       }
1217       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218       return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219    }
1220    else {
1221       return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222    }
1223 }
1224 
1225 
1226 /*
1227  * Widening mul, valid for 32x32 bit -> 64bit only.
1228  * Result is low 32bits, high bits returned in res_hi.
1229  *
1230  * Emits generic code.
1231  */
1232 LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b,LLVMValueRef * res_hi)1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
1234                      LLVMValueRef a,
1235                      LLVMValueRef b,
1236                      LLVMValueRef *res_hi)
1237 {
1238    struct gallivm_state *gallivm = bld->gallivm;
1239    LLVMBuilderRef builder = gallivm->builder;
1240    LLVMValueRef tmp, shift, res_lo;
1241    struct lp_type type_tmp;
1242    LLVMTypeRef wide_type, narrow_type;
1243 
1244    type_tmp = bld->type;
1245    narrow_type = lp_build_vec_type(gallivm, type_tmp);
1246    type_tmp.width *= 2;
1247    wide_type = lp_build_vec_type(gallivm, type_tmp);
1248    shift = lp_build_const_vec(gallivm, type_tmp, 32);
1249 
1250    if (bld->type.sign) {
1251       a = LLVMBuildSExt(builder, a, wide_type, "");
1252       b = LLVMBuildSExt(builder, b, wide_type, "");
1253    } else {
1254       a = LLVMBuildZExt(builder, a, wide_type, "");
1255       b = LLVMBuildZExt(builder, b, wide_type, "");
1256    }
1257    tmp = LLVMBuildMul(builder, a, b, "");
1258 
1259    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1260 
1261    /* Since we truncate anyway, LShr and AShr are equivalent. */
1262    tmp = LLVMBuildLShr(builder, tmp, shift, "");
1263    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1264 
1265    return res_lo;
1266 }
1267 
1268 
1269 /* a * b + c */
1270 LLVMValueRef
lp_build_mad(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b,LLVMValueRef c)1271 lp_build_mad(struct lp_build_context *bld,
1272              LLVMValueRef a,
1273              LLVMValueRef b,
1274              LLVMValueRef c)
1275 {
1276    const struct lp_type type = bld->type;
1277    if (type.floating) {
1278       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1279    } else {
1280       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1281    }
1282 }
1283 
1284 
1285 /**
1286  * Small vector x scale multiplication optimization.
1287  */
1288 LLVMValueRef
lp_build_mul_imm(struct lp_build_context * bld,LLVMValueRef a,int b)1289 lp_build_mul_imm(struct lp_build_context *bld,
1290                  LLVMValueRef a,
1291                  int b)
1292 {
1293    LLVMBuilderRef builder = bld->gallivm->builder;
1294    LLVMValueRef factor;
1295 
1296    assert(lp_check_value(bld->type, a));
1297 
1298    if(b == 0)
1299       return bld->zero;
1300 
1301    if(b == 1)
1302       return a;
1303 
1304    if(b == -1)
1305       return lp_build_negate(bld, a);
1306 
1307    if(b == 2 && bld->type.floating)
1308       return lp_build_add(bld, a, a);
1309 
1310    if(util_is_power_of_two(b)) {
1311       unsigned shift = ffs(b) - 1;
1312 
1313       if(bld->type.floating) {
1314 #if 0
1315          /*
1316           * Power of two multiplication by directly manipulating the exponent.
1317           *
1318           * XXX: This might not be always faster, it will introduce a small error
1319           * for multiplication by zero, and it will produce wrong results
1320           * for Inf and NaN.
1321           */
1322          unsigned mantissa = lp_mantissa(bld->type);
1323          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1324          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1325          a = LLVMBuildAdd(builder, a, factor, "");
1326          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1327          return a;
1328 #endif
1329       }
1330       else {
1331          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1332          return LLVMBuildShl(builder, a, factor, "");
1333       }
1334    }
1335 
1336    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1337    return lp_build_mul(bld, a, factor);
1338 }
1339 
1340 
1341 /**
1342  * Generate a / b
1343  */
1344 LLVMValueRef
lp_build_div(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b)1345 lp_build_div(struct lp_build_context *bld,
1346              LLVMValueRef a,
1347              LLVMValueRef b)
1348 {
1349    LLVMBuilderRef builder = bld->gallivm->builder;
1350    const struct lp_type type = bld->type;
1351 
1352    assert(lp_check_value(type, a));
1353    assert(lp_check_value(type, b));
1354 
1355    if(a == bld->zero)
1356       return bld->zero;
1357    if(a == bld->one && type.floating)
1358       return lp_build_rcp(bld, b);
1359    if(b == bld->zero)
1360       return bld->undef;
1361    if(b == bld->one)
1362       return a;
1363    if(a == bld->undef || b == bld->undef)
1364       return bld->undef;
1365 
1366    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1367       if (type.floating)
1368          return LLVMConstFDiv(a, b);
1369       else if (type.sign)
1370          return LLVMConstSDiv(a, b);
1371       else
1372          return LLVMConstUDiv(a, b);
1373    }
1374 
1375    /* fast rcp is disabled (just uses div), so makes no sense to try that */
1376    if(FALSE &&
1377       ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1378        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1379       type.floating)
1380       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1381 
1382    if (type.floating)
1383       return LLVMBuildFDiv(builder, a, b, "");
1384    else if (type.sign)
1385       return LLVMBuildSDiv(builder, a, b, "");
1386    else
1387       return LLVMBuildUDiv(builder, a, b, "");
1388 }
1389 
1390 
1391 /**
1392  * Linear interpolation helper.
1393  *
1394  * @param normalized whether we are interpolating normalized values,
1395  *        encoded in normalized integers, twice as wide.
1396  *
1397  * @sa http://www.stereopsis.com/doubleblend.html
1398  */
1399 static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef v0,LLVMValueRef v1,unsigned flags)1400 lp_build_lerp_simple(struct lp_build_context *bld,
1401                      LLVMValueRef x,
1402                      LLVMValueRef v0,
1403                      LLVMValueRef v1,
1404                      unsigned flags)
1405 {
1406    unsigned half_width = bld->type.width/2;
1407    LLVMBuilderRef builder = bld->gallivm->builder;
1408    LLVMValueRef delta;
1409    LLVMValueRef res;
1410 
1411    assert(lp_check_value(bld->type, x));
1412    assert(lp_check_value(bld->type, v0));
1413    assert(lp_check_value(bld->type, v1));
1414 
1415    delta = lp_build_sub(bld, v1, v0);
1416 
1417    if (bld->type.floating) {
1418       assert(flags == 0);
1419       return lp_build_mad(bld, x, delta, v0);
1420    }
1421 
1422    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1423       if (!bld->type.sign) {
1424          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1425             /*
1426              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1427              * most-significant-bit to the lowest-significant-bit, so that
1428              * later we can just divide by 2**n instead of 2**n - 1.
1429              */
1430 
1431             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1432          }
1433 
1434          /* (x * delta) >> n */
1435          res = lp_build_mul(bld, x, delta);
1436          res = lp_build_shr_imm(bld, res, half_width);
1437       } else {
1438          /*
1439           * The rescaling trick above doesn't work for signed numbers, so
1440           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1441           * instead.
1442           */
1443          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1444          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1445       }
1446    } else {
1447       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1448       res = lp_build_mul(bld, x, delta);
1449    }
1450 
1451    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1452       /*
1453        * At this point both res and v0 only use the lower half of the bits,
1454        * the rest is zero. Instead of add / mask, do add with half wide type.
1455        */
1456       struct lp_type narrow_type;
1457       struct lp_build_context narrow_bld;
1458 
1459       memset(&narrow_type, 0, sizeof narrow_type);
1460       narrow_type.sign   = bld->type.sign;
1461       narrow_type.width  = bld->type.width/2;
1462       narrow_type.length = bld->type.length*2;
1463 
1464       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1465       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1466       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1467       res = lp_build_add(&narrow_bld, v0, res);
1468       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1469    } else {
1470       res = lp_build_add(bld, v0, res);
1471 
1472       if (bld->type.fixed) {
1473          /*
1474           * We need to mask out the high order bits when lerping 8bit
1475           * normalized colors stored on 16bits
1476           */
1477          /* XXX: This step is necessary for lerping 8bit colors stored on
1478           * 16bits, but it will be wrong for true fixed point use cases.
1479           * Basically we need a more powerful lp_type, capable of further
1480           * distinguishing the values interpretation from the value storage.
1481           */
1482          LLVMValueRef low_bits;
1483          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1484          res = LLVMBuildAnd(builder, res, low_bits, "");
1485       }
1486    }
1487 
1488    return res;
1489 }
1490 
1491 
1492 /**
1493  * Linear interpolation.
1494  */
1495 LLVMValueRef
lp_build_lerp(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef v0,LLVMValueRef v1,unsigned flags)1496 lp_build_lerp(struct lp_build_context *bld,
1497               LLVMValueRef x,
1498               LLVMValueRef v0,
1499               LLVMValueRef v1,
1500               unsigned flags)
1501 {
1502    const struct lp_type type = bld->type;
1503    LLVMValueRef res;
1504 
1505    assert(lp_check_value(type, x));
1506    assert(lp_check_value(type, v0));
1507    assert(lp_check_value(type, v1));
1508 
1509    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1510 
1511    if (type.norm) {
1512       struct lp_type wide_type;
1513       struct lp_build_context wide_bld;
1514       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1515 
1516       assert(type.length >= 2);
1517 
1518       /*
1519        * Create a wider integer type, enough to hold the
1520        * intermediate result of the multiplication.
1521        */
1522       memset(&wide_type, 0, sizeof wide_type);
1523       wide_type.sign   = type.sign;
1524       wide_type.width  = type.width*2;
1525       wide_type.length = type.length/2;
1526 
1527       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1528 
1529       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1530       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1531       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1532 
1533       /*
1534        * Lerp both halves.
1535        */
1536 
1537       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1538 
1539       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1540       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1541 
1542       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1543    } else {
1544       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1545    }
1546 
1547    return res;
1548 }
1549 
1550 
1551 /**
1552  * Bilinear interpolation.
1553  *
1554  * Values indices are in v_{yx}.
1555  */
1556 LLVMValueRef
lp_build_lerp_2d(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y,LLVMValueRef v00,LLVMValueRef v01,LLVMValueRef v10,LLVMValueRef v11,unsigned flags)1557 lp_build_lerp_2d(struct lp_build_context *bld,
1558                  LLVMValueRef x,
1559                  LLVMValueRef y,
1560                  LLVMValueRef v00,
1561                  LLVMValueRef v01,
1562                  LLVMValueRef v10,
1563                  LLVMValueRef v11,
1564                  unsigned flags)
1565 {
1566    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1567    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1568    return lp_build_lerp(bld, y, v0, v1, flags);
1569 }
1570 
1571 
1572 LLVMValueRef
lp_build_lerp_3d(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y,LLVMValueRef z,LLVMValueRef v000,LLVMValueRef v001,LLVMValueRef v010,LLVMValueRef v011,LLVMValueRef v100,LLVMValueRef v101,LLVMValueRef v110,LLVMValueRef v111,unsigned flags)1573 lp_build_lerp_3d(struct lp_build_context *bld,
1574                  LLVMValueRef x,
1575                  LLVMValueRef y,
1576                  LLVMValueRef z,
1577                  LLVMValueRef v000,
1578                  LLVMValueRef v001,
1579                  LLVMValueRef v010,
1580                  LLVMValueRef v011,
1581                  LLVMValueRef v100,
1582                  LLVMValueRef v101,
1583                  LLVMValueRef v110,
1584                  LLVMValueRef v111,
1585                  unsigned flags)
1586 {
1587    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1588    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1589    return lp_build_lerp(bld, z, v0, v1, flags);
1590 }
1591 
1592 
1593 /**
1594  * Generate min(a, b)
1595  * Do checks for special cases but not for nans.
1596  */
1597 LLVMValueRef
lp_build_min(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b)1598 lp_build_min(struct lp_build_context *bld,
1599              LLVMValueRef a,
1600              LLVMValueRef b)
1601 {
1602    assert(lp_check_value(bld->type, a));
1603    assert(lp_check_value(bld->type, b));
1604 
1605    if(a == bld->undef || b == bld->undef)
1606       return bld->undef;
1607 
1608    if(a == b)
1609       return a;
1610 
1611    if (bld->type.norm) {
1612       if (!bld->type.sign) {
1613          if (a == bld->zero || b == bld->zero) {
1614             return bld->zero;
1615          }
1616       }
1617       if(a == bld->one)
1618          return b;
1619       if(b == bld->one)
1620          return a;
1621    }
1622 
1623    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1624 }
1625 
1626 
1627 /**
1628  * Generate min(a, b)
1629  * NaN's are handled according to the behavior specified by the
1630  * nan_behavior argument.
1631  */
1632 LLVMValueRef
lp_build_min_ext(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b,enum gallivm_nan_behavior nan_behavior)1633 lp_build_min_ext(struct lp_build_context *bld,
1634                  LLVMValueRef a,
1635                  LLVMValueRef b,
1636                  enum gallivm_nan_behavior nan_behavior)
1637 {
1638    assert(lp_check_value(bld->type, a));
1639    assert(lp_check_value(bld->type, b));
1640 
1641    if(a == bld->undef || b == bld->undef)
1642       return bld->undef;
1643 
1644    if(a == b)
1645       return a;
1646 
1647    if (bld->type.norm) {
1648       if (!bld->type.sign) {
1649          if (a == bld->zero || b == bld->zero) {
1650             return bld->zero;
1651          }
1652       }
1653       if(a == bld->one)
1654          return b;
1655       if(b == bld->one)
1656          return a;
1657    }
1658 
1659    return lp_build_min_simple(bld, a, b, nan_behavior);
1660 }
1661 
1662 /**
1663  * Generate max(a, b)
1664  * Do checks for special cases, but NaN behavior is undefined.
1665  */
1666 LLVMValueRef
lp_build_max(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b)1667 lp_build_max(struct lp_build_context *bld,
1668              LLVMValueRef a,
1669              LLVMValueRef b)
1670 {
1671    assert(lp_check_value(bld->type, a));
1672    assert(lp_check_value(bld->type, b));
1673 
1674    if(a == bld->undef || b == bld->undef)
1675       return bld->undef;
1676 
1677    if(a == b)
1678       return a;
1679 
1680    if(bld->type.norm) {
1681       if(a == bld->one || b == bld->one)
1682          return bld->one;
1683       if (!bld->type.sign) {
1684          if (a == bld->zero) {
1685             return b;
1686          }
1687          if (b == bld->zero) {
1688             return a;
1689          }
1690       }
1691    }
1692 
1693    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1694 }
1695 
1696 
1697 /**
1698  * Generate max(a, b)
1699  * Checks for special cases.
1700  * NaN's are handled according to the behavior specified by the
1701  * nan_behavior argument.
1702  */
1703 LLVMValueRef
lp_build_max_ext(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef b,enum gallivm_nan_behavior nan_behavior)1704 lp_build_max_ext(struct lp_build_context *bld,
1705                   LLVMValueRef a,
1706                   LLVMValueRef b,
1707                   enum gallivm_nan_behavior nan_behavior)
1708 {
1709    assert(lp_check_value(bld->type, a));
1710    assert(lp_check_value(bld->type, b));
1711 
1712    if(a == bld->undef || b == bld->undef)
1713       return bld->undef;
1714 
1715    if(a == b)
1716       return a;
1717 
1718    if(bld->type.norm) {
1719       if(a == bld->one || b == bld->one)
1720          return bld->one;
1721       if (!bld->type.sign) {
1722          if (a == bld->zero) {
1723             return b;
1724          }
1725          if (b == bld->zero) {
1726             return a;
1727          }
1728       }
1729    }
1730 
1731    return lp_build_max_simple(bld, a, b, nan_behavior);
1732 }
1733 
1734 /**
1735  * Generate clamp(a, min, max)
1736  * NaN behavior (for any of a, min, max) is undefined.
1737  * Do checks for special cases.
1738  */
1739 LLVMValueRef
lp_build_clamp(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef min,LLVMValueRef max)1740 lp_build_clamp(struct lp_build_context *bld,
1741                LLVMValueRef a,
1742                LLVMValueRef min,
1743                LLVMValueRef max)
1744 {
1745    assert(lp_check_value(bld->type, a));
1746    assert(lp_check_value(bld->type, min));
1747    assert(lp_check_value(bld->type, max));
1748 
1749    a = lp_build_min(bld, a, max);
1750    a = lp_build_max(bld, a, min);
1751    return a;
1752 }
1753 
1754 
1755 /**
1756  * Generate clamp(a, 0, 1)
1757  * A NaN will get converted to zero.
1758  */
1759 LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context * bld,LLVMValueRef a)1760 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1761                                 LLVMValueRef a)
1762 {
1763    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1764    a = lp_build_min(bld, a, bld->one);
1765    return a;
1766 }
1767 
1768 
1769 /**
1770  * Generate abs(a)
1771  */
1772 LLVMValueRef
lp_build_abs(struct lp_build_context * bld,LLVMValueRef a)1773 lp_build_abs(struct lp_build_context *bld,
1774              LLVMValueRef a)
1775 {
1776    LLVMBuilderRef builder = bld->gallivm->builder;
1777    const struct lp_type type = bld->type;
1778    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1779 
1780    assert(lp_check_value(type, a));
1781 
1782    if(!type.sign)
1783       return a;
1784 
1785    if(type.floating) {
1786       if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1787          /* Workaround llvm.org/PR27332 */
1788          LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1789          unsigned long long absMask = ~(1ULL << (type.width - 1));
1790          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1791          a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1792          a = LLVMBuildAnd(builder, a, mask, "");
1793          a = LLVMBuildBitCast(builder, a, vec_type, "");
1794          return a;
1795       } else {
1796          char intrinsic[32];
1797          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1798          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1799       }
1800    }
1801 
1802    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1803       switch(type.width) {
1804       case 8:
1805          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1806       case 16:
1807          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1808       case 32:
1809          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1810       }
1811    }
1812    else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1813       switch(type.width) {
1814       case 8:
1815          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1816       case 16:
1817          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1818       case 32:
1819          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1820       }
1821    }
1822 
1823    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1824                           a, LLVMBuildNeg(builder, a, ""));
1825 }
1826 
1827 
1828 LLVMValueRef
lp_build_negate(struct lp_build_context * bld,LLVMValueRef a)1829 lp_build_negate(struct lp_build_context *bld,
1830                 LLVMValueRef a)
1831 {
1832    LLVMBuilderRef builder = bld->gallivm->builder;
1833 
1834    assert(lp_check_value(bld->type, a));
1835 
1836    if (bld->type.floating)
1837       a = LLVMBuildFNeg(builder, a, "");
1838    else
1839       a = LLVMBuildNeg(builder, a, "");
1840 
1841    return a;
1842 }
1843 
1844 
1845 /** Return -1, 0 or +1 depending on the sign of a */
1846 LLVMValueRef
lp_build_sgn(struct lp_build_context * bld,LLVMValueRef a)1847 lp_build_sgn(struct lp_build_context *bld,
1848              LLVMValueRef a)
1849 {
1850    LLVMBuilderRef builder = bld->gallivm->builder;
1851    const struct lp_type type = bld->type;
1852    LLVMValueRef cond;
1853    LLVMValueRef res;
1854 
1855    assert(lp_check_value(type, a));
1856 
1857    /* Handle non-zero case */
1858    if(!type.sign) {
1859       /* if not zero then sign must be positive */
1860       res = bld->one;
1861    }
1862    else if(type.floating) {
1863       LLVMTypeRef vec_type;
1864       LLVMTypeRef int_type;
1865       LLVMValueRef mask;
1866       LLVMValueRef sign;
1867       LLVMValueRef one;
1868       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1869 
1870       int_type = lp_build_int_vec_type(bld->gallivm, type);
1871       vec_type = lp_build_vec_type(bld->gallivm, type);
1872       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1873 
1874       /* Take the sign bit and add it to 1 constant */
1875       sign = LLVMBuildBitCast(builder, a, int_type, "");
1876       sign = LLVMBuildAnd(builder, sign, mask, "");
1877       one = LLVMConstBitCast(bld->one, int_type);
1878       res = LLVMBuildOr(builder, sign, one, "");
1879       res = LLVMBuildBitCast(builder, res, vec_type, "");
1880    }
1881    else
1882    {
1883       /* signed int/norm/fixed point */
1884       /* could use psign with sse3 and appropriate vectors here */
1885       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1886       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1887       res = lp_build_select(bld, cond, bld->one, minus_one);
1888    }
1889 
1890    /* Handle zero */
1891    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1892    res = lp_build_select(bld, cond, bld->zero, res);
1893 
1894    return res;
1895 }
1896 
1897 
1898 /**
1899  * Set the sign of float vector 'a' according to 'sign'.
1900  * If sign==0, return abs(a).
1901  * If sign==1, return -abs(a);
1902  * Other values for sign produce undefined results.
1903  */
1904 LLVMValueRef
lp_build_set_sign(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef sign)1905 lp_build_set_sign(struct lp_build_context *bld,
1906                   LLVMValueRef a, LLVMValueRef sign)
1907 {
1908    LLVMBuilderRef builder = bld->gallivm->builder;
1909    const struct lp_type type = bld->type;
1910    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1911    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1912    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1913    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1914                              ~((unsigned long long) 1 << (type.width - 1)));
1915    LLVMValueRef val, res;
1916 
1917    assert(type.floating);
1918    assert(lp_check_value(type, a));
1919 
1920    /* val = reinterpret_cast<int>(a) */
1921    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1922    /* val = val & mask */
1923    val = LLVMBuildAnd(builder, val, mask, "");
1924    /* sign = sign << shift */
1925    sign = LLVMBuildShl(builder, sign, shift, "");
1926    /* res = val | sign */
1927    res = LLVMBuildOr(builder, val, sign, "");
1928    /* res = reinterpret_cast<float>(res) */
1929    res = LLVMBuildBitCast(builder, res, vec_type, "");
1930 
1931    return res;
1932 }
1933 
1934 
1935 /**
1936  * Convert vector of (or scalar) int to vector of (or scalar) float.
1937  */
1938 LLVMValueRef
lp_build_int_to_float(struct lp_build_context * bld,LLVMValueRef a)1939 lp_build_int_to_float(struct lp_build_context *bld,
1940                       LLVMValueRef a)
1941 {
1942    LLVMBuilderRef builder = bld->gallivm->builder;
1943    const struct lp_type type = bld->type;
1944    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945 
1946    assert(type.floating);
1947 
1948    return LLVMBuildSIToFP(builder, a, vec_type, "");
1949 }
1950 
1951 static boolean
arch_rounding_available(const struct lp_type type)1952 arch_rounding_available(const struct lp_type type)
1953 {
1954    if ((util_cpu_caps.has_sse4_1 &&
1955        (type.length == 1 || type.width*type.length == 128)) ||
1956        (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1957        (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1958       return TRUE;
1959    else if ((util_cpu_caps.has_altivec &&
1960             (type.width == 32 && type.length == 4)))
1961       return TRUE;
1962 
1963    return FALSE;
1964 }
1965 
/**
 * Rounding modes accepted by lp_build_round_arch() and
 * lp_build_round_altivec().
 */
enum lp_build_round_mode {
   LP_BUILD_ROUND_NEAREST  = 0,  /* round to nearest */
   LP_BUILD_ROUND_FLOOR    = 1,  /* round down */
   LP_BUILD_ROUND_CEIL     = 2,  /* round up */
   LP_BUILD_ROUND_TRUNCATE = 3   /* round toward zero */
};
1973 
1974 static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context * bld,LLVMValueRef a)1975 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1976                              LLVMValueRef a)
1977 {
1978    LLVMBuilderRef builder = bld->gallivm->builder;
1979    const struct lp_type type = bld->type;
1980    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1981    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1982    const char *intrinsic;
1983    LLVMValueRef res;
1984 
1985    assert(type.floating);
1986    /* using the double precision conversions is a bit more complicated */
1987    assert(type.width == 32);
1988 
1989    assert(lp_check_value(type, a));
1990    assert(util_cpu_caps.has_sse2);
1991 
1992    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1993    if (type.length == 1) {
1994       LLVMTypeRef vec_type;
1995       LLVMValueRef undef;
1996       LLVMValueRef arg;
1997       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1998 
1999       vec_type = LLVMVectorType(bld->elem_type, 4);
2000 
2001       intrinsic = "llvm.x86.sse.cvtss2si";
2002 
2003       undef = LLVMGetUndef(vec_type);
2004 
2005       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2006 
2007       res = lp_build_intrinsic_unary(builder, intrinsic,
2008                                      ret_type, arg);
2009    }
2010    else {
2011       if (type.width* type.length == 128) {
2012          intrinsic = "llvm.x86.sse2.cvtps2dq";
2013       }
2014       else {
2015          assert(type.width*type.length == 256);
2016          assert(util_cpu_caps.has_avx);
2017 
2018          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2019       }
2020       res = lp_build_intrinsic_unary(builder, intrinsic,
2021                                      ret_type, a);
2022    }
2023 
2024    return res;
2025 }
2026 
2027 
2028 /*
2029  */
2030 static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context * bld,LLVMValueRef a,enum lp_build_round_mode mode)2031 lp_build_round_altivec(struct lp_build_context *bld,
2032                        LLVMValueRef a,
2033                        enum lp_build_round_mode mode)
2034 {
2035    LLVMBuilderRef builder = bld->gallivm->builder;
2036    const struct lp_type type = bld->type;
2037    const char *intrinsic = NULL;
2038 
2039    assert(type.floating);
2040 
2041    assert(lp_check_value(type, a));
2042    assert(util_cpu_caps.has_altivec);
2043 
2044    (void)type;
2045 
2046    switch (mode) {
2047    case LP_BUILD_ROUND_NEAREST:
2048       intrinsic = "llvm.ppc.altivec.vrfin";
2049       break;
2050    case LP_BUILD_ROUND_FLOOR:
2051       intrinsic = "llvm.ppc.altivec.vrfim";
2052       break;
2053    case LP_BUILD_ROUND_CEIL:
2054       intrinsic = "llvm.ppc.altivec.vrfip";
2055       break;
2056    case LP_BUILD_ROUND_TRUNCATE:
2057       intrinsic = "llvm.ppc.altivec.vrfiz";
2058       break;
2059    }
2060 
2061    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2062 }
2063 
2064 static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context * bld,LLVMValueRef a,enum lp_build_round_mode mode)2065 lp_build_round_arch(struct lp_build_context *bld,
2066                     LLVMValueRef a,
2067                     enum lp_build_round_mode mode)
2068 {
2069    if (util_cpu_caps.has_sse4_1) {
2070       LLVMBuilderRef builder = bld->gallivm->builder;
2071       const struct lp_type type = bld->type;
2072       const char *intrinsic_root;
2073       char intrinsic[32];
2074 
2075       assert(type.floating);
2076       assert(lp_check_value(type, a));
2077       (void)type;
2078 
2079       switch (mode) {
2080       case LP_BUILD_ROUND_NEAREST:
2081          intrinsic_root = "llvm.nearbyint";
2082          break;
2083       case LP_BUILD_ROUND_FLOOR:
2084          intrinsic_root = "llvm.floor";
2085          break;
2086       case LP_BUILD_ROUND_CEIL:
2087          intrinsic_root = "llvm.ceil";
2088          break;
2089       case LP_BUILD_ROUND_TRUNCATE:
2090          intrinsic_root = "llvm.trunc";
2091          break;
2092       }
2093 
2094       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2095       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2096    }
2097    else /* (util_cpu_caps.has_altivec) */
2098      return lp_build_round_altivec(bld, a, mode);
2099 }
2100 
2101 /**
2102  * Return the integer part of a float (vector) value (== round toward zero).
2103  * The returned value is a float (vector).
2104  * Ex: trunc(-1.5) = -1.0
2105  */
2106 LLVMValueRef
lp_build_trunc(struct lp_build_context * bld,LLVMValueRef a)2107 lp_build_trunc(struct lp_build_context *bld,
2108                LLVMValueRef a)
2109 {
2110    LLVMBuilderRef builder = bld->gallivm->builder;
2111    const struct lp_type type = bld->type;
2112 
2113    assert(type.floating);
2114    assert(lp_check_value(type, a));
2115 
2116    if (arch_rounding_available(type)) {
2117       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2118    }
2119    else {
2120       const struct lp_type type = bld->type;
2121       struct lp_type inttype;
2122       struct lp_build_context intbld;
2123       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2124       LLVMValueRef trunc, res, anosign, mask;
2125       LLVMTypeRef int_vec_type = bld->int_vec_type;
2126       LLVMTypeRef vec_type = bld->vec_type;
2127 
2128       assert(type.width == 32); /* might want to handle doubles at some point */
2129 
2130       inttype = type;
2131       inttype.floating = 0;
2132       lp_build_context_init(&intbld, bld->gallivm, inttype);
2133 
2134       /* round by truncation */
2135       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2136       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2137 
2138       /* mask out sign bit */
2139       anosign = lp_build_abs(bld, a);
2140       /*
2141        * mask out all values if anosign > 2^24
2142        * This should work both for large ints (all rounding is no-op for them
2143        * because such floats are always exact) as well as special cases like
2144        * NaNs, Infs (taking advantage of the fact they use max exponent).
2145        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2146        */
2147       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2148       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2149       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2150       return lp_build_select(bld, mask, a, res);
2151    }
2152 }
2153 
2154 
2155 /**
2156  * Return float (vector) rounded to nearest integer (vector).  The returned
2157  * value is a float (vector).
2158  * Ex: round(0.9) = 1.0
2159  * Ex: round(-1.5) = -2.0
2160  */
2161 LLVMValueRef
lp_build_round(struct lp_build_context * bld,LLVMValueRef a)2162 lp_build_round(struct lp_build_context *bld,
2163                LLVMValueRef a)
2164 {
2165    LLVMBuilderRef builder = bld->gallivm->builder;
2166    const struct lp_type type = bld->type;
2167 
2168    assert(type.floating);
2169    assert(lp_check_value(type, a));
2170 
2171    if (arch_rounding_available(type)) {
2172       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2173    }
2174    else {
2175       const struct lp_type type = bld->type;
2176       struct lp_type inttype;
2177       struct lp_build_context intbld;
2178       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2179       LLVMValueRef res, anosign, mask;
2180       LLVMTypeRef int_vec_type = bld->int_vec_type;
2181       LLVMTypeRef vec_type = bld->vec_type;
2182 
2183       assert(type.width == 32); /* might want to handle doubles at some point */
2184 
2185       inttype = type;
2186       inttype.floating = 0;
2187       lp_build_context_init(&intbld, bld->gallivm, inttype);
2188 
2189       res = lp_build_iround(bld, a);
2190       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2191 
2192       /* mask out sign bit */
2193       anosign = lp_build_abs(bld, a);
2194       /*
2195        * mask out all values if anosign > 2^24
2196        * This should work both for large ints (all rounding is no-op for them
2197        * because such floats are always exact) as well as special cases like
2198        * NaNs, Infs (taking advantage of the fact they use max exponent).
2199        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2200        */
2201       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2202       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2203       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2204       return lp_build_select(bld, mask, a, res);
2205    }
2206 }
2207 
2208 
2209 /**
2210  * Return floor of float (vector), result is a float (vector)
2211  * Ex: floor(1.1) = 1.0
2212  * Ex: floor(-1.1) = -2.0
2213  */
2214 LLVMValueRef
lp_build_floor(struct lp_build_context * bld,LLVMValueRef a)2215 lp_build_floor(struct lp_build_context *bld,
2216                LLVMValueRef a)
2217 {
2218    LLVMBuilderRef builder = bld->gallivm->builder;
2219    const struct lp_type type = bld->type;
2220 
2221    assert(type.floating);
2222    assert(lp_check_value(type, a));
2223 
2224    if (arch_rounding_available(type)) {
2225       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2226    }
2227    else {
2228       const struct lp_type type = bld->type;
2229       struct lp_type inttype;
2230       struct lp_build_context intbld;
2231       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2232       LLVMValueRef trunc, res, anosign, mask;
2233       LLVMTypeRef int_vec_type = bld->int_vec_type;
2234       LLVMTypeRef vec_type = bld->vec_type;
2235 
2236       if (type.width != 32) {
2237          char intrinsic[32];
2238          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2239          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2240       }
2241 
2242       assert(type.width == 32); /* might want to handle doubles at some point */
2243 
2244       inttype = type;
2245       inttype.floating = 0;
2246       lp_build_context_init(&intbld, bld->gallivm, inttype);
2247 
2248       /* round by truncation */
2249       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2250       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2251 
2252       if (type.sign) {
2253          LLVMValueRef tmp;
2254 
2255          /*
2256           * fix values if rounding is wrong (for non-special cases)
2257           * - this is the case if trunc > a
2258           */
2259          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2260          /* tmp = trunc > a ? 1.0 : 0.0 */
2261          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2262          tmp = lp_build_and(&intbld, mask, tmp);
2263          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2264          res = lp_build_sub(bld, res, tmp);
2265       }
2266 
2267       /* mask out sign bit */
2268       anosign = lp_build_abs(bld, a);
2269       /*
2270        * mask out all values if anosign > 2^24
2271        * This should work both for large ints (all rounding is no-op for them
2272        * because such floats are always exact) as well as special cases like
2273        * NaNs, Infs (taking advantage of the fact they use max exponent).
2274        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2275        */
2276       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2277       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2278       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2279       return lp_build_select(bld, mask, a, res);
2280    }
2281 }
2282 
2283 
2284 /**
2285  * Return ceiling of float (vector), returning float (vector).
2286  * Ex: ceil( 1.1) = 2.0
2287  * Ex: ceil(-1.1) = -1.0
2288  */
2289 LLVMValueRef
lp_build_ceil(struct lp_build_context * bld,LLVMValueRef a)2290 lp_build_ceil(struct lp_build_context *bld,
2291               LLVMValueRef a)
2292 {
2293    LLVMBuilderRef builder = bld->gallivm->builder;
2294    const struct lp_type type = bld->type;
2295 
2296    assert(type.floating);
2297    assert(lp_check_value(type, a));
2298 
2299    if (arch_rounding_available(type)) {
2300       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2301    }
2302    else {
2303       const struct lp_type type = bld->type;
2304       struct lp_type inttype;
2305       struct lp_build_context intbld;
2306       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2307       LLVMValueRef trunc, res, anosign, mask, tmp;
2308       LLVMTypeRef int_vec_type = bld->int_vec_type;
2309       LLVMTypeRef vec_type = bld->vec_type;
2310 
2311       if (type.width != 32) {
2312          char intrinsic[32];
2313          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2314          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2315       }
2316 
2317       assert(type.width == 32); /* might want to handle doubles at some point */
2318 
2319       inttype = type;
2320       inttype.floating = 0;
2321       lp_build_context_init(&intbld, bld->gallivm, inttype);
2322 
2323       /* round by truncation */
2324       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2325       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2326 
2327       /*
2328        * fix values if rounding is wrong (for non-special cases)
2329        * - this is the case if trunc < a
2330        */
2331       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2332       /* tmp = trunc < a ? 1.0 : 0.0 */
2333       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2334       tmp = lp_build_and(&intbld, mask, tmp);
2335       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2336       res = lp_build_add(bld, trunc, tmp);
2337 
2338       /* mask out sign bit */
2339       anosign = lp_build_abs(bld, a);
2340       /*
2341        * mask out all values if anosign > 2^24
2342        * This should work both for large ints (all rounding is no-op for them
2343        * because such floats are always exact) as well as special cases like
2344        * NaNs, Infs (taking advantage of the fact they use max exponent).
2345        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2346        */
2347       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2348       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2349       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2350       return lp_build_select(bld, mask, a, res);
2351    }
2352 }
2353 
2354 
2355 /**
2356  * Return fractional part of 'a' computed as a - floor(a)
2357  * Typically used in texture coord arithmetic.
2358  */
2359 LLVMValueRef
lp_build_fract(struct lp_build_context * bld,LLVMValueRef a)2360 lp_build_fract(struct lp_build_context *bld,
2361                LLVMValueRef a)
2362 {
2363    assert(bld->type.floating);
2364    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2365 }
2366 
2367 
2368 /**
2369  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2370  * against 0.99999(9). (Will also return that value for NaNs.)
2371  */
2372 static inline LLVMValueRef
clamp_fract(struct lp_build_context * bld,LLVMValueRef fract)2373 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2374 {
2375    LLVMValueRef max;
2376 
2377    /* this is the largest number smaller than 1.0 representable as float */
2378    max = lp_build_const_vec(bld->gallivm, bld->type,
2379                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2380    return lp_build_min_ext(bld, fract, max,
2381                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2382 }
2383 
2384 
2385 /**
2386  * Same as lp_build_fract, but guarantees that the result is always smaller
2387  * than one. Will also return the smaller-than-one value for infs, NaNs.
2388  */
2389 LLVMValueRef
lp_build_fract_safe(struct lp_build_context * bld,LLVMValueRef a)2390 lp_build_fract_safe(struct lp_build_context *bld,
2391                     LLVMValueRef a)
2392 {
2393    return clamp_fract(bld, lp_build_fract(bld, a));
2394 }
2395 
2396 
2397 /**
2398  * Return the integer part of a float (vector) value (== round toward zero).
2399  * The returned value is an integer (vector).
2400  * Ex: itrunc(-1.5) = -1
2401  */
2402 LLVMValueRef
lp_build_itrunc(struct lp_build_context * bld,LLVMValueRef a)2403 lp_build_itrunc(struct lp_build_context *bld,
2404                 LLVMValueRef a)
2405 {
2406    LLVMBuilderRef builder = bld->gallivm->builder;
2407    const struct lp_type type = bld->type;
2408    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2409 
2410    assert(type.floating);
2411    assert(lp_check_value(type, a));
2412 
2413    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2414 }
2415 
2416 
2417 /**
2418  * Return float (vector) rounded to nearest integer (vector).  The returned
2419  * value is an integer (vector).
2420  * Ex: iround(0.9) = 1
2421  * Ex: iround(-1.5) = -2
2422  */
2423 LLVMValueRef
lp_build_iround(struct lp_build_context * bld,LLVMValueRef a)2424 lp_build_iround(struct lp_build_context *bld,
2425                 LLVMValueRef a)
2426 {
2427    LLVMBuilderRef builder = bld->gallivm->builder;
2428    const struct lp_type type = bld->type;
2429    LLVMTypeRef int_vec_type = bld->int_vec_type;
2430    LLVMValueRef res;
2431 
2432    assert(type.floating);
2433 
2434    assert(lp_check_value(type, a));
2435 
2436    if ((util_cpu_caps.has_sse2 &&
2437        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2438        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2439       return lp_build_iround_nearest_sse2(bld, a);
2440    }
2441    if (arch_rounding_available(type)) {
2442       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2443    }
2444    else {
2445       LLVMValueRef half;
2446 
2447       half = lp_build_const_vec(bld->gallivm, type, 0.5);
2448 
2449       if (type.sign) {
2450          LLVMTypeRef vec_type = bld->vec_type;
2451          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2452                                     (unsigned long long)1 << (type.width - 1));
2453          LLVMValueRef sign;
2454 
2455          /* get sign bit */
2456          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2457          sign = LLVMBuildAnd(builder, sign, mask, "");
2458 
2459          /* sign * 0.5 */
2460          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2461          half = LLVMBuildOr(builder, sign, half, "");
2462          half = LLVMBuildBitCast(builder, half, vec_type, "");
2463       }
2464 
2465       res = LLVMBuildFAdd(builder, a, half, "");
2466    }
2467 
2468    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2469 
2470    return res;
2471 }
2472 
2473 
2474 /**
2475  * Return floor of float (vector), result is an int (vector)
2476  * Ex: ifloor(1.1) = 1.0
2477  * Ex: ifloor(-1.1) = -2.0
2478  */
2479 LLVMValueRef
lp_build_ifloor(struct lp_build_context * bld,LLVMValueRef a)2480 lp_build_ifloor(struct lp_build_context *bld,
2481                 LLVMValueRef a)
2482 {
2483    LLVMBuilderRef builder = bld->gallivm->builder;
2484    const struct lp_type type = bld->type;
2485    LLVMTypeRef int_vec_type = bld->int_vec_type;
2486    LLVMValueRef res;
2487 
2488    assert(type.floating);
2489    assert(lp_check_value(type, a));
2490 
2491    res = a;
2492    if (type.sign) {
2493       if (arch_rounding_available(type)) {
2494          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2495       }
2496       else {
2497          struct lp_type inttype;
2498          struct lp_build_context intbld;
2499          LLVMValueRef trunc, itrunc, mask;
2500 
2501          assert(type.floating);
2502          assert(lp_check_value(type, a));
2503 
2504          inttype = type;
2505          inttype.floating = 0;
2506          lp_build_context_init(&intbld, bld->gallivm, inttype);
2507 
2508          /* round by truncation */
2509          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2510          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2511 
2512          /*
2513           * fix values if rounding is wrong (for non-special cases)
2514           * - this is the case if trunc > a
2515           * The results of doing this with NaNs, very large values etc.
2516           * are undefined but this seems to be the case anyway.
2517           */
2518          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2519          /* cheapie minus one with mask since the mask is minus one / zero */
2520          return lp_build_add(&intbld, itrunc, mask);
2521       }
2522    }
2523 
2524    /* round to nearest (toward zero) */
2525    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2526 
2527    return res;
2528 }
2529 
2530 
2531 /**
2532  * Return ceiling of float (vector), returning int (vector).
2533  * Ex: iceil( 1.1) = 2
2534  * Ex: iceil(-1.1) = -1
2535  */
2536 LLVMValueRef
lp_build_iceil(struct lp_build_context * bld,LLVMValueRef a)2537 lp_build_iceil(struct lp_build_context *bld,
2538                LLVMValueRef a)
2539 {
2540    LLVMBuilderRef builder = bld->gallivm->builder;
2541    const struct lp_type type = bld->type;
2542    LLVMTypeRef int_vec_type = bld->int_vec_type;
2543    LLVMValueRef res;
2544 
2545    assert(type.floating);
2546    assert(lp_check_value(type, a));
2547 
2548    if (arch_rounding_available(type)) {
2549       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2550    }
2551    else {
2552       struct lp_type inttype;
2553       struct lp_build_context intbld;
2554       LLVMValueRef trunc, itrunc, mask;
2555 
2556       assert(type.floating);
2557       assert(lp_check_value(type, a));
2558 
2559       inttype = type;
2560       inttype.floating = 0;
2561       lp_build_context_init(&intbld, bld->gallivm, inttype);
2562 
2563       /* round by truncation */
2564       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2565       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2566 
2567       /*
2568        * fix values if rounding is wrong (for non-special cases)
2569        * - this is the case if trunc < a
2570        * The results of doing this with NaNs, very large values etc.
2571        * are undefined but this seems to be the case anyway.
2572        */
2573       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2574       /* cheapie plus one with mask since the mask is minus one / zero */
2575       return lp_build_sub(&intbld, itrunc, mask);
2576    }
2577 
2578    /* round to nearest (toward zero) */
2579    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2580 
2581    return res;
2582 }
2583 
2584 
2585 /**
2586  * Combined ifloor() & fract().
2587  *
2588  * Preferred to calling the functions separately, as it will ensure that the
2589  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2590  */
2591 void
lp_build_ifloor_fract(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef * out_ipart,LLVMValueRef * out_fpart)2592 lp_build_ifloor_fract(struct lp_build_context *bld,
2593                       LLVMValueRef a,
2594                       LLVMValueRef *out_ipart,
2595                       LLVMValueRef *out_fpart)
2596 {
2597    LLVMBuilderRef builder = bld->gallivm->builder;
2598    const struct lp_type type = bld->type;
2599    LLVMValueRef ipart;
2600 
2601    assert(type.floating);
2602    assert(lp_check_value(type, a));
2603 
2604    if (arch_rounding_available(type)) {
2605       /*
2606        * floor() is easier.
2607        */
2608 
2609       ipart = lp_build_floor(bld, a);
2610       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2611       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2612    }
2613    else {
2614       /*
2615        * ifloor() is easier.
2616        */
2617 
2618       *out_ipart = lp_build_ifloor(bld, a);
2619       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2620       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2621    }
2622 }
2623 
2624 
2625 /**
2626  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2627  * always smaller than one.
2628  */
2629 void
lp_build_ifloor_fract_safe(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef * out_ipart,LLVMValueRef * out_fpart)2630 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2631                            LLVMValueRef a,
2632                            LLVMValueRef *out_ipart,
2633                            LLVMValueRef *out_fpart)
2634 {
2635    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2636    *out_fpart = clamp_fract(bld, *out_fpart);
2637 }
2638 
2639 
2640 LLVMValueRef
lp_build_sqrt(struct lp_build_context * bld,LLVMValueRef a)2641 lp_build_sqrt(struct lp_build_context *bld,
2642               LLVMValueRef a)
2643 {
2644    LLVMBuilderRef builder = bld->gallivm->builder;
2645    const struct lp_type type = bld->type;
2646    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2647    char intrinsic[32];
2648 
2649    assert(lp_check_value(type, a));
2650 
2651    assert(type.floating);
2652    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2653 
2654    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2655 }
2656 
2657 
2658 /**
2659  * Do one Newton-Raphson step to improve reciprocate precision:
2660  *
2661  *   x_{i+1} = x_i * (2 - a * x_i)
2662  *
2663  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2664  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2665  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2666  * halo. It would be necessary to clamp the argument to prevent this.
2667  *
2668  * See also:
2669  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2670  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2671  */
2672 static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef rcp_a)2673 lp_build_rcp_refine(struct lp_build_context *bld,
2674                     LLVMValueRef a,
2675                     LLVMValueRef rcp_a)
2676 {
2677    LLVMBuilderRef builder = bld->gallivm->builder;
2678    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2679    LLVMValueRef res;
2680 
2681    res = LLVMBuildFMul(builder, a, rcp_a, "");
2682    res = LLVMBuildFSub(builder, two, res, "");
2683    res = LLVMBuildFMul(builder, rcp_a, res, "");
2684 
2685    return res;
2686 }
2687 
2688 
2689 LLVMValueRef
lp_build_rcp(struct lp_build_context * bld,LLVMValueRef a)2690 lp_build_rcp(struct lp_build_context *bld,
2691              LLVMValueRef a)
2692 {
2693    LLVMBuilderRef builder = bld->gallivm->builder;
2694    const struct lp_type type = bld->type;
2695 
2696    assert(lp_check_value(type, a));
2697 
2698    if(a == bld->zero)
2699       return bld->undef;
2700    if(a == bld->one)
2701       return bld->one;
2702    if(a == bld->undef)
2703       return bld->undef;
2704 
2705    assert(type.floating);
2706 
2707    if(LLVMIsConstant(a))
2708       return LLVMConstFDiv(bld->one, a);
2709 
2710    /*
2711     * We don't use RCPPS because:
2712     * - it only has 10bits of precision
2713     * - it doesn't even get the reciprocate of 1.0 exactly
2714     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2715     * - for recent processors the benefit over DIVPS is marginal, a case
2716     *   dependent
2717     *
2718     * We could still use it on certain processors if benchmarks show that the
2719     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2720     * particular uses that require less workarounds.
2721     */
2722 
2723    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2724          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2725       const unsigned num_iterations = 0;
2726       LLVMValueRef res;
2727       unsigned i;
2728       const char *intrinsic = NULL;
2729 
2730       if (type.length == 4) {
2731          intrinsic = "llvm.x86.sse.rcp.ps";
2732       }
2733       else {
2734          intrinsic = "llvm.x86.avx.rcp.ps.256";
2735       }
2736 
2737       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2738 
2739       for (i = 0; i < num_iterations; ++i) {
2740          res = lp_build_rcp_refine(bld, a, res);
2741       }
2742 
2743       return res;
2744    }
2745 
2746    return LLVMBuildFDiv(builder, bld->one, a, "");
2747 }
2748 
2749 
2750 /**
2751  * Do one Newton-Raphson step to improve rsqrt precision:
2752  *
2753  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2754  *
2755  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2756  */
2757 static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef rsqrt_a)2758 lp_build_rsqrt_refine(struct lp_build_context *bld,
2759                       LLVMValueRef a,
2760                       LLVMValueRef rsqrt_a)
2761 {
2762    LLVMBuilderRef builder = bld->gallivm->builder;
2763    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2764    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2765    LLVMValueRef res;
2766 
2767    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2768    res = LLVMBuildFMul(builder, a, res, "");
2769    res = LLVMBuildFSub(builder, three, res, "");
2770    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2771    res = LLVMBuildFMul(builder, half, res, "");
2772 
2773    return res;
2774 }
2775 
2776 
2777 /**
2778  * Generate 1/sqrt(a).
2779  * Result is undefined for values < 0, infinity for +0.
2780  */
2781 LLVMValueRef
lp_build_rsqrt(struct lp_build_context * bld,LLVMValueRef a)2782 lp_build_rsqrt(struct lp_build_context *bld,
2783                LLVMValueRef a)
2784 {
2785    const struct lp_type type = bld->type;
2786 
2787    assert(lp_check_value(type, a));
2788 
2789    assert(type.floating);
2790 
2791    /*
2792     * This should be faster but all denormals will end up as infinity.
2793     */
2794    if (0 && lp_build_fast_rsqrt_available(type)) {
2795       const unsigned num_iterations = 1;
2796       LLVMValueRef res;
2797       unsigned i;
2798 
2799       /* rsqrt(1.0) != 1.0 here */
2800       res = lp_build_fast_rsqrt(bld, a);
2801 
2802       if (num_iterations) {
2803          /*
2804           * Newton-Raphson will result in NaN instead of infinity for zero,
2805           * and NaN instead of zero for infinity.
2806           * Also, need to ensure rsqrt(1.0) == 1.0.
2807           * All numbers smaller than FLT_MIN will result in +infinity
2808           * (rsqrtps treats all denormals as zero).
2809           */
2810          LLVMValueRef cmp;
2811          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2812          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2813 
2814          for (i = 0; i < num_iterations; ++i) {
2815             res = lp_build_rsqrt_refine(bld, a, res);
2816          }
2817          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2818          res = lp_build_select(bld, cmp, inf, res);
2819          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2820          res = lp_build_select(bld, cmp, bld->zero, res);
2821          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2822          res = lp_build_select(bld, cmp, bld->one, res);
2823       }
2824 
2825       return res;
2826    }
2827 
2828    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2829 }
2830 
2831 /**
2832  * If there's a fast (inaccurate) rsqrt instruction available
2833  * (caller may want to avoid to call rsqrt_fast if it's not available,
2834  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2835  * unavailable it would result in sqrt/div/mul so obviously
2836  * much better to just call sqrt, skipping both div and mul).
2837  */
2838 boolean
lp_build_fast_rsqrt_available(struct lp_type type)2839 lp_build_fast_rsqrt_available(struct lp_type type)
2840 {
2841    assert(type.floating);
2842 
2843    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2844        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2845       return true;
2846    }
2847    return false;
2848 }
2849 
2850 
2851 /**
2852  * Generate 1/sqrt(a).
2853  * Result is undefined for values < 0, infinity for +0.
2854  * Precision is limited, only ~10 bits guaranteed
2855  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2856  */
2857 LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context * bld,LLVMValueRef a)2858 lp_build_fast_rsqrt(struct lp_build_context *bld,
2859                     LLVMValueRef a)
2860 {
2861    LLVMBuilderRef builder = bld->gallivm->builder;
2862    const struct lp_type type = bld->type;
2863 
2864    assert(lp_check_value(type, a));
2865 
2866    if (lp_build_fast_rsqrt_available(type)) {
2867       const char *intrinsic = NULL;
2868 
2869       if (type.length == 4) {
2870          intrinsic = "llvm.x86.sse.rsqrt.ps";
2871       }
2872       else {
2873          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2874       }
2875       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2876    }
2877    else {
2878       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2879    }
2880    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2881 }
2882 
2883 
/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 *
 * The steps mirror the SSE reference code quoted in the comments below:
 * take |a|, scale by 4/Pi, derive an even integer index from it, reduce
 * the argument with "extended precision modular arithmetic", evaluate both
 * a cosine and a sine polynomial, pick one per element with a mask, and
 * finally apply the sign. The result is clamped to [-1, 1] and replaced
 * by NaN wherever the input is not finite.
 *
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however, scs
 * opcode could also benefit from calculating both though.
 *
 * NOTE(review): the sign masks (0x80000000) and the shift by 29 are written
 * for 32-bit lanes, so this appears to assume 32-bit floats - confirm.
 *
 * @param a    argument (float vector)
 * @param cos  TRUE to generate cos(a), FALSE to generate sin(a)
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    *  take the absolute value (clear the sign bit on the integer view),
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * convert the even index back to float
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos: cos uses the index minus 2.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   /*
    * cos: sign bit is bit 2 of ~emm2_2, shifted up by 29.
    * sin: sign bit is the sign of the input xor'ed with (emm2_add << 29),
    *      masked down to the top bit.
    */
   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                                              LLVMBuildShl(b, emm2_add,
                                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * (the DP constants above are already negated, hence the multiply-adds)
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynom  (0 <= x <= Pi/4)
    * (the one built from the coscof_* coefficients)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    * ... continued with multiply-adds through coscof_p1 and coscof_p2,
    * followed by two more multiplications by z.
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    *
    * Note: the IR value names given to y_9 and y_10 below ("y_8", "y_9")
    * lag the C variable names by one.
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom
    * (the one built from the sincof_* coefficients)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynoms
    * (bitwise blend on the integer views: y2 where poly_mask is set,
    * y elsewhere)
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   /* finiteness is tested on the original argument, not on the result */
   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type,  1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type,  NAN));
   return y_result;
}
3093 
3094 
3095 /**
3096  * Generate sin(a)
3097  */
3098 LLVMValueRef
lp_build_sin(struct lp_build_context * bld,LLVMValueRef a)3099 lp_build_sin(struct lp_build_context *bld,
3100              LLVMValueRef a)
3101 {
3102    return lp_build_sin_or_cos(bld, a, FALSE);
3103 }
3104 
3105 
3106 /**
3107  * Generate cos(a)
3108  */
3109 LLVMValueRef
lp_build_cos(struct lp_build_context * bld,LLVMValueRef a)3110 lp_build_cos(struct lp_build_context *bld,
3111              LLVMValueRef a)
3112 {
3113    return lp_build_sin_or_cos(bld, a, TRUE);
3114 }
3115 
3116 
3117 /**
3118  * Generate pow(x, y)
3119  */
3120 LLVMValueRef
lp_build_pow(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y)3121 lp_build_pow(struct lp_build_context *bld,
3122              LLVMValueRef x,
3123              LLVMValueRef y)
3124 {
3125    /* TODO: optimize the constant case */
3126    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3127        LLVMIsConstant(x) && LLVMIsConstant(y)) {
3128       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3129                    __FUNCTION__);
3130    }
3131 
3132    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3133 }
3134 
3135 
3136 /**
3137  * Generate exp(x)
3138  */
3139 LLVMValueRef
lp_build_exp(struct lp_build_context * bld,LLVMValueRef x)3140 lp_build_exp(struct lp_build_context *bld,
3141              LLVMValueRef x)
3142 {
3143    /* log2(e) = 1/log(2) */
3144    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3145                                            1.4426950408889634);
3146 
3147    assert(lp_check_value(bld->type, x));
3148 
3149    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3150 }
3151 
3152 
3153 /**
3154  * Generate log(x)
3155  * Behavior is undefined with infs, 0s and nans
3156  */
3157 LLVMValueRef
lp_build_log(struct lp_build_context * bld,LLVMValueRef x)3158 lp_build_log(struct lp_build_context *bld,
3159              LLVMValueRef x)
3160 {
3161    /* log(2) */
3162    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3163                                           0.69314718055994529);
3164 
3165    assert(lp_check_value(bld->type, x));
3166 
3167    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3168 }
3169 
3170 /**
3171  * Generate log(x) that handles edge cases (infs, 0s and nans)
3172  */
3173 LLVMValueRef
lp_build_log_safe(struct lp_build_context * bld,LLVMValueRef x)3174 lp_build_log_safe(struct lp_build_context *bld,
3175                   LLVMValueRef x)
3176 {
3177    /* log(2) */
3178    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3179                                           0.69314718055994529);
3180 
3181    assert(lp_check_value(bld->type, x));
3182 
3183    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3184 }
3185 
3186 
3187 /**
3188  * Generate polynomial.
3189  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3190  */
3191 LLVMValueRef
lp_build_polynomial(struct lp_build_context * bld,LLVMValueRef x,const double * coeffs,unsigned num_coeffs)3192 lp_build_polynomial(struct lp_build_context *bld,
3193                     LLVMValueRef x,
3194                     const double *coeffs,
3195                     unsigned num_coeffs)
3196 {
3197    const struct lp_type type = bld->type;
3198    LLVMValueRef even = NULL, odd = NULL;
3199    LLVMValueRef x2;
3200    unsigned i;
3201 
3202    assert(lp_check_value(bld->type, x));
3203 
3204    /* TODO: optimize the constant case */
3205    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3206        LLVMIsConstant(x)) {
3207       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3208                    __FUNCTION__);
3209    }
3210 
3211    /*
3212     * Calculate odd and even terms seperately to decrease data dependency
3213     * Ex:
3214     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
3215     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3216     */
3217    x2 = lp_build_mul(bld, x, x);
3218 
3219    for (i = num_coeffs; i--; ) {
3220       LLVMValueRef coeff;
3221 
3222       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3223 
3224       if (i % 2 == 0) {
3225          if (even)
3226             even = lp_build_mad(bld, x2, even, coeff);
3227          else
3228             even = coeff;
3229       } else {
3230          if (odd)
3231             odd = lp_build_mad(bld, x2, odd, coeff);
3232          else
3233             odd = coeff;
3234       }
3235    }
3236 
3237    if (odd)
3238       return lp_build_mad(bld, odd, x, even);
3239    else if (even)
3240       return even;
3241    else
3242       return bld->undef;
3243 }
3244 
3245 
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * The coefficients are listed lowest order first (the first entry is the
 * constant term), which is the layout lp_build_polynomial() expects.
 * Exactly one set is compiled in, selected by EXP_POLY_DEGREE (defined
 * outside this section); any value other than 2..5 is a compile error.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
3276 
3277 
3278 LLVMValueRef
lp_build_exp2(struct lp_build_context * bld,LLVMValueRef x)3279 lp_build_exp2(struct lp_build_context *bld,
3280               LLVMValueRef x)
3281 {
3282    LLVMBuilderRef builder = bld->gallivm->builder;
3283    const struct lp_type type = bld->type;
3284    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3285    LLVMValueRef ipart = NULL;
3286    LLVMValueRef fpart = NULL;
3287    LLVMValueRef expipart = NULL;
3288    LLVMValueRef expfpart = NULL;
3289    LLVMValueRef res = NULL;
3290 
3291    assert(lp_check_value(bld->type, x));
3292 
3293    /* TODO: optimize the constant case */
3294    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3295        LLVMIsConstant(x)) {
3296       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3297                    __FUNCTION__);
3298    }
3299 
3300    assert(type.floating && type.width == 32);
3301 
3302    /* We want to preserve NaN and make sure than for exp2 if x > 128,
3303     * the result is INF  and if it's smaller than -126.9 the result is 0 */
3304    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3305                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3306    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3307                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3308 
3309    /* ipart = floor(x) */
3310    /* fpart = x - ipart */
3311    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3312 
3313    /* expipart = (float) (1 << ipart) */
3314    expipart = LLVMBuildAdd(builder, ipart,
3315                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3316    expipart = LLVMBuildShl(builder, expipart,
3317                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3318    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3319 
3320    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3321                                   ARRAY_SIZE(lp_build_exp2_polynomial));
3322 
3323    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3324 
3325    return res;
3326 }
3327 
3328 
3329 
3330 /**
3331  * Extract the exponent of a IEEE-754 floating point value.
3332  *
3333  * Optionally apply an integer bias.
3334  *
3335  * Result is an integer value with
3336  *
3337  *   ifloor(log2(x)) + bias
3338  */
3339 LLVMValueRef
lp_build_extract_exponent(struct lp_build_context * bld,LLVMValueRef x,int bias)3340 lp_build_extract_exponent(struct lp_build_context *bld,
3341                           LLVMValueRef x,
3342                           int bias)
3343 {
3344    LLVMBuilderRef builder = bld->gallivm->builder;
3345    const struct lp_type type = bld->type;
3346    unsigned mantissa = lp_mantissa(type);
3347    LLVMValueRef res;
3348 
3349    assert(type.floating);
3350 
3351    assert(lp_check_value(bld->type, x));
3352 
3353    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3354 
3355    res = LLVMBuildLShr(builder, x,
3356                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3357    res = LLVMBuildAnd(builder, res,
3358                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3359    res = LLVMBuildSub(builder, res,
3360                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3361 
3362    return res;
3363 }
3364 
3365 
3366 /**
3367  * Extract the mantissa of the a floating.
3368  *
3369  * Result is a floating point value with
3370  *
3371  *   x / floor(log2(x))
3372  */
3373 LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context * bld,LLVMValueRef x)3374 lp_build_extract_mantissa(struct lp_build_context *bld,
3375                           LLVMValueRef x)
3376 {
3377    LLVMBuilderRef builder = bld->gallivm->builder;
3378    const struct lp_type type = bld->type;
3379    unsigned mantissa = lp_mantissa(type);
3380    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3381                                                   (1ULL << mantissa) - 1);
3382    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3383    LLVMValueRef res;
3384 
3385    assert(lp_check_value(bld->type, x));
3386 
3387    assert(type.floating);
3388 
3389    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3390 
3391    /* res = x / 2**ipart */
3392    res = LLVMBuildAnd(builder, x, mantmask, "");
3393    res = LLVMBuildOr(builder, res, one, "");
3394    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3395 
3396    return res;
3397 }
3398 
3399 
3400 
/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 *
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * The table holds LOG_POLY_DEGREE + 1 coefficients. Only degrees 3, 4 and 5
 * are provided; any other value of LOG_POLY_DEGREE is a compile-time error.
 *
 * The literals carry an L (long double) suffix but are stored as double.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
/* unsupported polynomial degree */
#error
#endif
};
3429 
3430 /**
3431  * See http://www.devmaster.net/forums/showthread.php?p=43580
3432  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3433  * http://www.nezumi.demon.co.uk/consult/logx.htm
3434  *
3435  * If handle_edge_cases is true the function will perform computations
3436  * to match the required D3D10+ behavior for each of the edge cases.
3437  * That means that if input is:
3438  * - less than zero (to and including -inf) then NaN will be returned
3439  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3440  * - +infinity, then +infinity will be returned
3441  * - NaN, then NaN will be returned
3442  *
3443  * Those checks are fairly expensive so if you don't need them make sure
3444  * handle_edge_cases is false.
3445  */
3446 void
lp_build_log2_approx(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef * p_exp,LLVMValueRef * p_floor_log2,LLVMValueRef * p_log2,boolean handle_edge_cases)3447 lp_build_log2_approx(struct lp_build_context *bld,
3448                      LLVMValueRef x,
3449                      LLVMValueRef *p_exp,
3450                      LLVMValueRef *p_floor_log2,
3451                      LLVMValueRef *p_log2,
3452                      boolean handle_edge_cases)
3453 {
3454    LLVMBuilderRef builder = bld->gallivm->builder;
3455    const struct lp_type type = bld->type;
3456    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3457    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3458 
3459    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3460    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3461    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3462 
3463    LLVMValueRef i = NULL;
3464    LLVMValueRef y = NULL;
3465    LLVMValueRef z = NULL;
3466    LLVMValueRef exp = NULL;
3467    LLVMValueRef mant = NULL;
3468    LLVMValueRef logexp = NULL;
3469    LLVMValueRef p_z = NULL;
3470    LLVMValueRef res = NULL;
3471 
3472    assert(lp_check_value(bld->type, x));
3473 
3474    if(p_exp || p_floor_log2 || p_log2) {
3475       /* TODO: optimize the constant case */
3476       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3477           LLVMIsConstant(x)) {
3478          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3479                       __FUNCTION__);
3480       }
3481 
3482       assert(type.floating && type.width == 32);
3483 
3484       /*
3485        * We don't explicitly handle denormalized numbers. They will yield a
3486        * result in the neighbourhood of -127, which appears to be adequate
3487        * enough.
3488        */
3489 
3490       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3491 
3492       /* exp = (float) exponent(x) */
3493       exp = LLVMBuildAnd(builder, i, expmask, "");
3494    }
3495 
3496    if(p_floor_log2 || p_log2) {
3497       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3498       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3499       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3500    }
3501 
3502    if (p_log2) {
3503       /* mant = 1 + (float) mantissa(x) */
3504       mant = LLVMBuildAnd(builder, i, mantmask, "");
3505       mant = LLVMBuildOr(builder, mant, one, "");
3506       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3507 
3508       /* y = (mant - 1) / (mant + 1) */
3509       y = lp_build_div(bld,
3510          lp_build_sub(bld, mant, bld->one),
3511          lp_build_add(bld, mant, bld->one)
3512       );
3513 
3514       /* z = y^2 */
3515       z = lp_build_mul(bld, y, y);
3516 
3517       /* compute P(z) */
3518       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3519                                 ARRAY_SIZE(lp_build_log2_polynomial));
3520 
3521       /* y * P(z) + logexp */
3522       res = lp_build_mad(bld, y, p_z, logexp);
3523 
3524       if (type.floating && handle_edge_cases) {
3525          LLVMValueRef negmask, infmask,  zmask;
3526          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3527                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3528          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3529                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3530          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3531                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3532 
3533          /* If x is qual to inf make sure we return inf */
3534          res = lp_build_select(bld, infmask,
3535                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3536                                res);
3537          /* If x is qual to 0, return -inf */
3538          res = lp_build_select(bld, zmask,
3539                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3540                                res);
3541          /* If x is nan or less than 0, return nan */
3542          res = lp_build_select(bld, negmask,
3543                                lp_build_const_vec(bld->gallivm, type,  NAN),
3544                                res);
3545       }
3546    }
3547 
3548    if (p_exp) {
3549       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3550       *p_exp = exp;
3551    }
3552 
3553    if (p_floor_log2)
3554       *p_floor_log2 = logexp;
3555 
3556    if (p_log2)
3557       *p_log2 = res;
3558 }
3559 
3560 
3561 /*
3562  * log2 implementation which doesn't have special code to
3563  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3564  * the results for those cases are undefined.
3565  */
3566 LLVMValueRef
lp_build_log2(struct lp_build_context * bld,LLVMValueRef x)3567 lp_build_log2(struct lp_build_context *bld,
3568               LLVMValueRef x)
3569 {
3570    LLVMValueRef res;
3571    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3572    return res;
3573 }
3574 
3575 /*
3576  * Version of log2 which handles all edge cases.
3577  * Look at documentation of lp_build_log2_approx for
3578  * description of the behavior for each of the edge cases.
3579  */
3580 LLVMValueRef
lp_build_log2_safe(struct lp_build_context * bld,LLVMValueRef x)3581 lp_build_log2_safe(struct lp_build_context *bld,
3582                    LLVMValueRef x)
3583 {
3584    LLVMValueRef res;
3585    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3586    return res;
3587 }
3588 
3589 
3590 /**
3591  * Faster (and less accurate) log2.
3592  *
3593  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3594  *
3595  * Piece-wise linear approximation, with exact results when x is a
3596  * power of two.
3597  *
3598  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3599  */
3600 LLVMValueRef
lp_build_fast_log2(struct lp_build_context * bld,LLVMValueRef x)3601 lp_build_fast_log2(struct lp_build_context *bld,
3602                    LLVMValueRef x)
3603 {
3604    LLVMBuilderRef builder = bld->gallivm->builder;
3605    LLVMValueRef ipart;
3606    LLVMValueRef fpart;
3607 
3608    assert(lp_check_value(bld->type, x));
3609 
3610    assert(bld->type.floating);
3611 
3612    /* ipart = floor(log2(x)) - 1 */
3613    ipart = lp_build_extract_exponent(bld, x, -1);
3614    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3615 
3616    /* fpart = x / 2**ipart */
3617    fpart = lp_build_extract_mantissa(bld, x);
3618 
3619    /* ipart + fpart */
3620    return LLVMBuildFAdd(builder, ipart, fpart, "");
3621 }
3622 
3623 
3624 /**
3625  * Fast implementation of iround(log2(x)).
3626  *
3627  * Not an approximation -- it should give accurate results all the time.
3628  */
3629 LLVMValueRef
lp_build_ilog2(struct lp_build_context * bld,LLVMValueRef x)3630 lp_build_ilog2(struct lp_build_context *bld,
3631                LLVMValueRef x)
3632 {
3633    LLVMBuilderRef builder = bld->gallivm->builder;
3634    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3635    LLVMValueRef ipart;
3636 
3637    assert(bld->type.floating);
3638 
3639    assert(lp_check_value(bld->type, x));
3640 
3641    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3642    x = LLVMBuildFMul(builder, x, sqrt2, "");
3643 
3644    /* ipart = floor(log2(x) + 0.5)  */
3645    ipart = lp_build_extract_exponent(bld, x, 0);
3646 
3647    return ipart;
3648 }
3649 
3650 LLVMValueRef
lp_build_mod(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y)3651 lp_build_mod(struct lp_build_context *bld,
3652              LLVMValueRef x,
3653              LLVMValueRef y)
3654 {
3655    LLVMBuilderRef builder = bld->gallivm->builder;
3656    LLVMValueRef res;
3657    const struct lp_type type = bld->type;
3658 
3659    assert(lp_check_value(type, x));
3660    assert(lp_check_value(type, y));
3661 
3662    if (type.floating)
3663       res = LLVMBuildFRem(builder, x, y, "");
3664    else if (type.sign)
3665       res = LLVMBuildSRem(builder, x, y, "");
3666    else
3667       res = LLVMBuildURem(builder, x, y, "");
3668    return res;
3669 }
3670 
3671 
3672 /*
3673  * For floating inputs it creates and returns a mask
3674  * which is all 1's for channels which are NaN.
3675  * Channels inside x which are not NaN will be 0.
3676  */
3677 LLVMValueRef
lp_build_isnan(struct lp_build_context * bld,LLVMValueRef x)3678 lp_build_isnan(struct lp_build_context *bld,
3679                LLVMValueRef x)
3680 {
3681    LLVMValueRef mask;
3682    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3683 
3684    assert(bld->type.floating);
3685    assert(lp_check_value(bld->type, x));
3686 
3687    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3688                         "isnotnan");
3689    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3690    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3691    return mask;
3692 }
3693 
3694 /* Returns all 1's for floating point numbers that are
3695  * finite numbers and returns all zeros for -inf,
3696  * inf and nan's */
3697 LLVMValueRef
lp_build_isfinite(struct lp_build_context * bld,LLVMValueRef x)3698 lp_build_isfinite(struct lp_build_context *bld,
3699                   LLVMValueRef x)
3700 {
3701    LLVMBuilderRef builder = bld->gallivm->builder;
3702    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3703    struct lp_type int_type = lp_int_type(bld->type);
3704    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3705    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3706                                                     0x7f800000);
3707 
3708    if (!bld->type.floating) {
3709       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3710    }
3711    assert(bld->type.floating);
3712    assert(lp_check_value(bld->type, x));
3713    assert(bld->type.width == 32);
3714 
3715    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3716    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3717                            intx, infornan32);
3718 }
3719 
3720 /*
3721  * Returns true if the number is nan or inf and false otherwise.
3722  * The input has to be a floating point vector.
3723  */
3724 LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state * gallivm,const struct lp_type type,LLVMValueRef x)3725 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3726                        const struct lp_type type,
3727                        LLVMValueRef x)
3728 {
3729    LLVMBuilderRef builder = gallivm->builder;
3730    struct lp_type int_type = lp_int_type(type);
3731    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3732                                                 0x7f800000);
3733    LLVMValueRef ret;
3734 
3735    assert(type.floating);
3736 
3737    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3738    ret = LLVMBuildAnd(builder, ret, const0, "");
3739    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3740                           ret, const0);
3741 
3742    return ret;
3743 }
3744 
3745 
3746 LLVMValueRef
lp_build_fpstate_get(struct gallivm_state * gallivm)3747 lp_build_fpstate_get(struct gallivm_state *gallivm)
3748 {
3749    if (util_cpu_caps.has_sse) {
3750       LLVMBuilderRef builder = gallivm->builder;
3751       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3752          gallivm,
3753          LLVMInt32TypeInContext(gallivm->context),
3754          "mxcsr_ptr");
3755       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3756           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3757       lp_build_intrinsic(builder,
3758                          "llvm.x86.sse.stmxcsr",
3759                          LLVMVoidTypeInContext(gallivm->context),
3760                          &mxcsr_ptr8, 1, 0);
3761       return mxcsr_ptr;
3762    }
3763    return 0;
3764 }
3765 
3766 void
lp_build_fpstate_set_denorms_zero(struct gallivm_state * gallivm,boolean zero)3767 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3768                                   boolean zero)
3769 {
3770    if (util_cpu_caps.has_sse) {
3771       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3772       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3773 
3774       LLVMBuilderRef builder = gallivm->builder;
3775       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3776       LLVMValueRef mxcsr =
3777          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3778 
3779       if (util_cpu_caps.has_daz) {
3780          /* Enable denormals are zero mode */
3781          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3782       }
3783       if (zero) {
3784          mxcsr = LLVMBuildOr(builder, mxcsr,
3785                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3786       } else {
3787          mxcsr = LLVMBuildAnd(builder, mxcsr,
3788                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3789       }
3790 
3791       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3792       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3793    }
3794 }
3795 
3796 void
lp_build_fpstate_set(struct gallivm_state * gallivm,LLVMValueRef mxcsr_ptr)3797 lp_build_fpstate_set(struct gallivm_state *gallivm,
3798                      LLVMValueRef mxcsr_ptr)
3799 {
3800    if (util_cpu_caps.has_sse) {
3801       LLVMBuilderRef builder = gallivm->builder;
3802       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3803                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3804       lp_build_intrinsic(builder,
3805                          "llvm.x86.sse.ldmxcsr",
3806                          LLVMVoidTypeInContext(gallivm->context),
3807                          &mxcsr_ptr, 1, 0);
3808    }
3809 }
3810