/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special-case values of a or b being 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those.
       */
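      /*
       * (Concretely, the SSE min/max instructions return the second source
       * operand whenever either operand is a NaN, so e.g. min(NaN, x) = x
       * but min(x, NaN) = NaN.)
       */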
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));

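   /*
    * lp_format_intrinsic appends the overloaded type suffix; e.g. for a
    * <4 x float> argument the name below resolves to "llvm.fmuladd.v4f32".
    */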
   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for special-case values of a or b being 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
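 *
 * (For unsigned normalized types the bitwise-not form is exact: e.g. with
 * 8-bit unorm values, where 1.0 is encoded as 255, ~a computes 255 - a.)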
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
        return bld->one;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * newer llvm versions no longer support the intrinsics, but recognize
          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
          * code, it is important we match the pattern llvm uses (and pray llvm
          * doesn't change it - and hope they decide on the same pattern for
          * all backends supporting it...).
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * This operation should be avoided whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors one could do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */
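   /*
    * E.g. for a float4 {a0,a1,a2,a3} the loop below computes
    * {a0,a1} + {a2,a3} = {a0+a2, a1+a3}, and the final extract/add
    * then yields (a0+a2) + (a1+a3).
    */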

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
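 *
 * Data flow sketch for inputs x, y, z, w (each a float4):
 *   tmp0 = {x0,x1,y0,y1}   tmp1 = {x2,x3,y2,y3}
 *   tmp2 = {z0,z1,w0,w1}   tmp3 = {z2,z3,w2,w3}
 *   sum0 = tmp0 + tmp1     sum1 = tmp2 + tmp3
 *   result = {even lanes of (sum0,sum1)} + {odd lanes of (sum0,sum1)}
 *          = {sum(x), sum(y), sum(z), sum(w)}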
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                          lp_build_vec_type(gallivm, bld->type),
                                          tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
        return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 8) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
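 *     For reference, a scalar sketch of this last variant (hypothetical
 *     helper, not part of gallivm):
 *
 *       static inline uint8_t mul_norm_u8(uint8_t a, uint8_t b)
 *       {
 *          uint32_t t = (uint32_t)a * b;
 *          return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
 *       }
 *
 *     e.g. a = b = 255 gives t = 65025 and (65025 + 254 + 128) >> 8 = 255,
 *     as required.
 *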
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not; without this llvm
    * will likely still produce atrocious code.
    */
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
        util_cpu_caps.has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
 * Result is low N bits, high bits returned in res_hi.
 *
 * Emits generic code.
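 *
 * Scalar equivalent for the unsigned 32-bit case, as a sketch:
 *
 *    uint64_t t = (uint64_t)a * b;
 *    res_lo = (uint32_t)t;
 *    *res_hi = (uint32_t)(t >> 32);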
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   if (bld->type.width < 32)
      type_tmp.width = 32;
   else
      type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}


/* a * b + c */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Small vector x scale multiplication optimization.
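 *
 * (E.g. an integer multiply by 8 is emitted as a shift left by 3 below.)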
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so makes no sense to try that */
   if(FALSE &&
      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param flags if LP_BLD_LERP_WIDE_NORMALIZED is set, we are interpolating
 *        normalized values, encoded in integers twice as wide as the
 *        effective precision.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit to the least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
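             *
             * E.g. for n = 8, x = 255 becomes 255 + (255 >> 7) = 256 = 2**8,
             * while x = 0 stays 0.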
             */

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8-bit
          * normalized colors stored in 16 bits.
          */
         /* XXX: This step is necessary for lerping 8-bit colors stored in
          * 16 bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
1439          LLVMValueRef low_bits;
1440          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1441          res = LLVMBuildAnd(builder, res, low_bits, "");
1442       }
1443    }
1444 
1445    return res;
1446 }


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx} order, i.e. the first index selects the row.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Does checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}

/**
 * Generate max(a, b)
 * Does checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Does checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                  LLVMValueRef a,
                  LLVMValueRef b,
                  enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}

/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Does checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      char intrinsic[32];
      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }

   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
                          a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and OR it into the constant 1.0 */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256) ||
       (util_cpu_caps.has_avx512f && type.width*type.length == 512))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;
   else if (util_cpu_caps.has_neon)
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};

static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width*type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/**
 * Round using the AltiVec vrfi{n,m,p,z} intrinsics.
 */
static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   (void)type;

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}

static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      const struct lp_type type = bld->type;
      const char *intrinsic_root;
      char intrinsic[32];

      assert(type.floating);
      assert(lp_check_value(type, a));
      (void)type;

      switch (mode) {
      case LP_BUILD_ROUND_NEAREST:
         intrinsic_root = "llvm.nearbyint";
         break;
      case LP_BUILD_ROUND_FLOOR:
         intrinsic_root = "llvm.floor";
         break;
      case LP_BUILD_ROUND_CEIL:
         intrinsic_root = "llvm.ceil";
         break;
      case LP_BUILD_ROUND_TRUNCATE:
         intrinsic_root = "llvm.trunc";
         break;
      }

      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else /* (util_cpu_caps.has_altivec) */
     return lp_build_round_altivec(bld, a, mode);
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special cases
       * like NaNs, Infs (taking advantage of the fact they use the max
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
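      /*
       * Illustrative reasoning: a 32-bit float has a 24-bit significand,
       * so at magnitudes >= 2^24 the spacing between consecutive floats
       * is at least 1.0 and every representable value is already an
       * integer; keeping the input unchanged there is exact.
       */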
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special cases
       * like NaNs, Infs (taking advantage of the fact they use the max
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
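         /*
          * Worked example (illustrative): a = -1.1 truncates to -1.0,
          * which is greater than a, so 1.0 is subtracted to yield the
          * correct floor of -2.0; for a = 1.1 the truncation 1.0 is not
          * greater than a and nothing changes.
          */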
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special cases
       * like NaNs, Infs (taking advantage of the fact they use the max
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
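      /*
       * Worked example (illustrative): a = 1.1 truncates to 1.0, which is
       * less than a, so 1.0 is added to yield the correct ceiling of 2.0;
       * for a = -1.1 the truncation -1.0 is not less than a and is
       * already the ceiling.
       */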
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special cases
       * like NaNs, Infs (taking advantage of the fact they use the max
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
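   /*
    * Sanity check (illustrative): for 32-bit floats lp_mantissa is
    * presumably 23, giving 1.0 - 2^-24 = 0.99999994 (0x3f7fffff),
    * which is indeed the largest float below 1.0.
    */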
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
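
      /*
       * Note (illustrative): nextafterf(0.5, 0.0) is the largest float
       * strictly below 0.5. With plain 0.5, an input just under 0.5
       * (0.5 - 2^-25) plus 0.5 would round up to exactly 1.0 and then
       * truncate to 1 instead of 0; with the slightly smaller constant
       * the sum stays below 1.0.
       */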

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined, but this seems to be the case anyway.
          */
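         /*
          * Worked example (illustrative): a = -1.1 gives itrunc = -1 and
          * trunc = -1.0 > a, so the all-ones mask (-1) is added:
          * -1 + (-1) = -2, the correct floor.
          */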
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheap minus one with mask, since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined, but this seems to be the case anyway.
       */
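      /*
       * Worked example (illustrative): a = 1.1 gives itrunc = 1 and
       * trunc = 1.0 < a, so the all-ones mask (-1) is subtracted:
       * 1 - (-1) = 2, the correct ceiling.
       */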
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheap plus one with mask, since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs. ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef neg_a;
   LLVMValueRef res;

   neg_a = LLVMBuildFNeg(builder, a, "");
   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);

   return res;
}
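
/*
 * Worked example for the step above (illustrative, scalar math only):
 * refining x = 0.4 as an estimate of 1/2.0 gives
 *   x_1 = 0.4  + 0.4  * (1 - 2.0 * 0.4)  = 0.48
 *   x_2 = 0.48 + 0.48 * (1 - 2.0 * 0.48) = 0.4992
 * i.e. the error roughly squares with each step (quadratic convergence),
 * which is why one or two steps suffice after a low-precision hardware
 * estimate.
 */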


LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and
    *   case-dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
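
/*
 * Worked example for the step above (illustrative, scalar math only):
 * the formula is Newton's method applied to f(x) = x^-2 - a. Refining
 * x = 0.4 as an estimate of 1/sqrt(4.0) gives
 *   x_1 = 0.5 * 0.4 * (3.0 - 4.0 * 0.4 * 0.4) = 0.472
 * converging toward the exact value 0.5.
 */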


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * Report whether a fast (but inaccurate) rsqrt instruction is available.
 * Callers may want to avoid calling lp_build_fast_rsqrt() when it isn't:
 * e.g. for x^0.5 it pays off to do rsqrt_fast(x) * x, but when the fast
 * path is unavailable that expands to sqrt/div/mul, so it is obviously
 * better to just call sqrt directly, skipping both the div and the mul.
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return true;
   }
   return false;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using the same source
 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering, however; the scs
 * opcode could also benefit from calculating both.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                                              LLVMBuildShl(b, emm2_add,
                                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
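   /*
    * Note (illustrative): -(DP1 + DP2 + DP3) ~= 0.7853981633974483 = Pi/4,
    * split so that each partial product y * DPi is (nearly) exact in
    * single precision; subtracting the three terms in sequence removes
    * y * Pi/4 with far less cancellation error than a single subtraction
    * of y * Pi/4 rounded to one float would.
    */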
2937    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2938    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2939    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial  (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * Select the correct result from the two polynomials.
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * Update the sign.
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type,  1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type,  NAN));
   return y_result;
}
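
/*
 * For reference, a scalar sketch of the cephes-style scheme generated
 * above (illustrative only; ref_sin_or_cos is a hypothetical name, and the
 * final NaN/clamp handling of lp_build_sin_or_cos is omitted):
 */
#if 0
#include <math.h>

static float
ref_sin_or_cos(float x, int want_cos)
{
   float ax = fabsf(x);

   /* scale by 4/Pi and round the octant count up to an even value */
   int j = ((int)(ax * (float)(4.0 / M_PI)) + 1) & ~1;
   float y = (float)j;

   /* Cody-Waite reduction of ax - y*Pi/4 into [-Pi/4, Pi/4] */
   float r = ((ax - y * 0.78515625f)
                  - y * 2.4187564849853515625e-4f)
                  - y * 3.77489497744594108e-8f;
   float z = r * r;

   /* the two minimax polynomials evaluated by the code above */
   float c = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
              + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f;
   float s = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
              - 1.6666654611e-1f) * z * r + r;

   /* cos uses a shifted octant index for selection and sign */
   int q = want_cos ? j - 2 : j;
   float poly = (q & 2) ? c : s;
   int negate = want_cos ? ((q & 4) == 0)
                         : ((x < 0.0f) ^ ((j & 4) != 0));
   return negate ? -poly : poly;
}
#endif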


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
   LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));

   res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
   return res;
}
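
/*
 * Equivalent scalar computation (illustrative only; ref_pow is a
 * hypothetical name).  Note the zero-base select above means pow(0, y),
 * including pow(0, 0), yields 0:
 */
#if 0
#include <math.h>

static float
ref_pow(float x, float y)
{
   /* guard x == 0: the non-edge-case log2 used above is undefined there */
   if (x == 0.0f)
      return 0.0f;
   return exp2f(y * log2f(x));
}
#endif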


/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}


/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}
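
/*
 * Scalar sketch of the even/odd split above (illustrative only;
 * ref_polynomial is a hypothetical name).  The two Horner chains in x^2
 * can issue independently, roughly halving the dependency depth of a
 * single Horner evaluation:
 */
#if 0
static float
ref_polynomial(const float *coeffs, unsigned num_coeffs, float x)
{
   float x2 = x * x;
   float even = 0.0f, odd = 0.0f;
   unsigned i;

   /* accumulate from the highest-degree coefficient down, as above */
   for (i = num_coeffs; i--; ) {
      if (i % 2 == 0)
         even = even * x2 + coeffs[i];
      else
         odd = odd * x2 + coeffs[i];
   }
   return odd * x + even;
}
#endif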


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN, and make sure that for exp2 if x > 128
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
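
/*
 * Scalar sketch of the construction above (illustrative only; ref_exp2 is
 * a hypothetical name, EXP_POLY_DEGREE == 5 is assumed, and the NaN
 * propagation of the *_ext min/max variants is not reproduced):
 */
#if 0
#include <math.h>
#include <stdint.h>
#include <string.h>

static float
ref_exp2(float x)
{
   x = fminf(x, 128.0f);
   x = fmaxf(x, -126.99999f);

   float ipart = floorf(x);
   float fpart = x - ipart;

   /* 2^ipart built directly in the IEEE-754 exponent field */
   uint32_t bits = (uint32_t)((int32_t)ipart + 127) << 23;
   float expipart;
   memcpy(&expipart, &bits, sizeof expipart);

   /* 2^fpart on [0, 1) via the same minimax table */
   const double *c = lp_build_exp2_polynomial;
   float expfpart = (float)(c[0] + fpart * (c[1] + fpart * (c[2] +
                      fpart * (c[3] + fpart * (c[4] + fpart * c[5])))));

   return expipart * expfpart;
}
#endif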



/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
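
/*
 * Scalar equivalent of the bit manipulation above (illustrative only;
 * ref_extract_exponent is a hypothetical name):
 */
#if 0
#include <stdint.h>
#include <string.h>

static int32_t
ref_extract_exponent(float x, int bias)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   /* shift out the mantissa, mask the exponent field, remove the bias */
   return (int32_t)((bits >> 23) & 0xff) - (127 - bias);
}
#endif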


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
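
/*
 * Scalar equivalent (illustrative only; ref_extract_mantissa is a
 * hypothetical name).  Splicing the exponent bits of 1.0 onto the
 * mantissa yields a value in [1, 2) for normal inputs:
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
ref_extract_mantissa(float x)
{
   uint32_t bits;
   float res;
   memcpy(&bits, &x, sizeof bits);
   bits = (bits & 0x007fffff) | 0x3f800000;   /* 0x3f800000 == 1.0f */
   memcpy(&res, &bits, sizeof res);
   return res;   /* x / 2**floor(log2(x)) */
}
#endif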



/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (down to and including -inf), then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if (p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if (p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type,  0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type,  0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type,  INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type,  INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type,  -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type,  NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
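
/*
 * Scalar sketch of the main path above (illustrative only; ref_log2 is a
 * hypothetical name, LOG_POLY_DEGREE == 4 is assumed, and the D3D10
 * edge-case selects are omitted):
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
ref_log2(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);

   /* floor(log2(x)) straight from the exponent field */
   float logexp = (float)(int32_t)((bits >> 23) & 0xff) - 127.0f;

   /* mantissa in [1, 2) */
   uint32_t mbits = (bits & 0x007fffff) | 0x3f800000;
   float mant;
   memcpy(&mant, &mbits, sizeof mant);

   /* log2(mant) = y * P(y^2) with y = (mant - 1) / (mant + 1) */
   float y = (mant - 1.0f) / (mant + 1.0f);
   float z = y * y;
   const double *c = lp_build_log2_polynomial;
   float p = (float)(c[0] + z * (c[1] + z * (c[2] + z * (c[3] + z * c[4]))));

   return y * p + logexp;
}
#endif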


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
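
/*
 * In scalar terms (illustrative only; ref_fast_log2 is a hypothetical
 * name): for x = m * 2^e with m in [1, 2), this returns (e - 1) + m,
 * which equals e exactly when m == 1 and interpolates linearly in
 * between:
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
ref_fast_log2(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);

   /* biased exponent minus 128, i.e. floor(log2(x)) - 1 */
   float ipart = (float)(int32_t)((bits >> 23) & 0xff) - 128.0f;

   /* mantissa in [1, 2) */
   uint32_t mbits = (bits & 0x007fffff) | 0x3f800000;
   float mant;
   memcpy(&mant, &mbits, sizeof mant);

   return ipart + mant;
}
#endif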


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
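
/*
 * Worked example: for x = 5.0, log2(5) ~= 2.32, so iround gives 2.
 * After the multiply, 5 * sqrt(2) ~= 7.07, whose exponent field yields
 * floor(log2(7.07)) = 2 -- the same result without any explicit
 * rounding step.
 */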

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}


/*
 * For floating point inputs, creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}
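
/*
 * The trick above in scalar form (illustrative only; ref_isnan is a
 * hypothetical name): NaN is the only value that compares unequal to
 * itself under an ordered comparison.
 */
#if 0
static int
ref_isnan(float x)
{
   return !(x == x);
}
#endif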

/* Returns all 1's for floating point numbers that are
 * finite, and returns all zeros for -inf,
 * inf and NaN. */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}
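
/*
 * Scalar sketch of the exponent test above (illustrative only;
 * ref_isfinite is a hypothetical name): a 32-bit float is finite iff its
 * exponent field is not all ones.
 */
#if 0
#include <stdint.h>
#include <string.h>

static int
ref_isfinite(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   return (bits & 0x7f800000) != 0x7f800000;
}
#endif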

/*
 * Returns true if the number is NaN or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}
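
/*
 * Host-side equivalent of the IR emitted above (illustrative only;
 * ref_set_denorms_zero is a hypothetical name -- the real function
 * manipulates MXCSR from generated code at run time):
 */
#if 0
#include <xmmintrin.h>

static void
ref_set_denorms_zero(int zero, int has_daz)
{
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;          /* FTZ, 0x8000 */
   if (has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;           /* DAZ, 0x0040 */

   unsigned mxcsr = _mm_getcsr();
   _mm_setcsr(zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz));
}
#endif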

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}