/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special-case values of a or b (e.g. 1 or 0) are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special-case values of a or b (e.g. 1 or 0) are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
         intr_size = 128;
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
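 *
 * For unsigned normalized types 1.0 is represented as all ones, so 1 - a
 * reduces to the bitwise complement (e.g. for 8-bit unorm, 0xff - a == ~a),
 * which is why a plain "not" suffices in that case below.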
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * This operation should be avoided whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors one could do much better with psadbw.
    * Using repeated shuffles/adds here. Note that with multiple vectors
    * this can be done more efficiently as outlined in the Intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */
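   /*
    * For example, for a length-4 vector (a0 a1 a2 a3) the loop below runs
    * once: vec1 = (a0 a1), vec2 = (a2 a3), vecres = (a0+a2 a1+a3); the two
    * remaining elements are then extracted and added, giving a0+a1+a2+a3.
    */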

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in the Intel Optimization Manual.
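 *
 * With src = {x, y, z, w}, where x = (x0 x1 x2 x3) etc., the shuffles below
 * produce:
 *
 *   tmp[0] = (x0 x1 y0 y1)   tmp[1] = (x2 x3 y2 y3)
 *   tmp[2] = (z0 z1 w0 w1)   tmp[3] = (z2 z3 w2 w3)
 *   sumtmp[0] = (x0+x2 x1+x3 y0+y2 y1+y3)
 *   sumtmp[1] = (z0+z2 z1+z3 w0+w2 w1+w3)
 *
 * and the final even/odd shuffle plus add yields (sum(x) sum(y) sum(z) sum(w)).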
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8-bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria:
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as it
 *     yields 255*255 = 254, so the special case b = 255 must be accounted
 *     for, or roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results
 *
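 *     For example, for a = b = 255: t = 65025, t >> 8 = 254, and
 *     (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255, i.e. the exact result,
 *     whereas the truncating variant gives (65025 + 254) >> 8 = 254.
 *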
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(struct gallivm_state *gallivm,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef c8;
   LLVMValueRef ab;

   assert(!i16_type.floating);
   assert(lp_check_value(i16_type, a));
   assert(lp_check_value(i16_type, b));

   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation -- without any checks.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8-bit colors stored on 16 bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256]
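       * by computing x += x >> (type.width - 1): 255 maps to 256 (so
       * x == 255 selects v1 exactly), while values below 128 are unchanged.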
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}


LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with ssse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
sse41_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;

   return FALSE;
}

enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning a fractional part of 1.0 for very small negative values of
 * 'a' by clamping against 0.99999(9).
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
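   /* (for 32-bit float: 1.0 - 2^-24 = 0.99999994) */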
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min(bld, fract, max);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (sse41_rounding_available(type)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMValueRef half;

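      /*
       * Copy the sign of a onto 0.5, add, and let the FPToSI below truncate
       * toward zero: e.g. iround(-1.5) = trunc(-1.5 + -0.5) = -2.
       */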
      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (sse41_rounding_available(type)) {
         res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
      }
      else {
         /* Take the sign bit and add it to 1 constant */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "ifloor.sign");

         /* offset = -0.99999(9)f */
         offset = lp_build_const_vec(bld->gallivm, type,
                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f */
      offset = lp_build_const_vec(bld->gallivm, type,
                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));

      if (type.sign) {
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* sign = a < 0 ? 0 : ~0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "iceil.sign");
         sign = LLVMBuildNot(builder, sign, "iceil.not");

         /* offset = a < 0 ? 0.0 : offset */
         offset = LLVMConstBitCast(offset, int_vec_type);
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
      }

      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   if (type.length == 1) {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
   }
   else {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
   }

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
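 *
 * If x_i = (1 - e) / a, then x_{i+1} = (1 - e^2) / a, i.e. the relative
 * error is squared (roughly doubling the number of correct bits) per step.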
1863 *
1864 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1865 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1866 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1867 * halo. It would be necessary to clamp the argument to prevent this.
1868 *
1869 * See also:
1870 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1871 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1872 */
1873 static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context * bld,LLVMValueRef a,LLVMValueRef rcp_a)1874 lp_build_rcp_refine(struct lp_build_context *bld,
1875 LLVMValueRef a,
1876 LLVMValueRef rcp_a)
1877 {
1878 LLVMBuilderRef builder = bld->gallivm->builder;
1879 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1880 LLVMValueRef res;
1881
1882 res = LLVMBuildFMul(builder, a, rcp_a, "");
1883 res = LLVMBuildFSub(builder, two, res, "");
1884 res = LLVMBuildFMul(builder, rcp_a, res, "");
1885
1886 return res;
1887 }
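/*
 * Example (a sketch): each refinement step roughly doubles the number of
 * correct bits, so a low-precision hardware estimate could be improved as:
 *
 *    est = lp_build_rcp_refine(bld, a, est);
 *    est = lp_build_rcp_refine(bld, a, est);
 */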
1888
1889
1890 LLVMValueRef
1891 lp_build_rcp(struct lp_build_context *bld,
1892 LLVMValueRef a)
1893 {
1894 LLVMBuilderRef builder = bld->gallivm->builder;
1895 const struct lp_type type = bld->type;
1896
1897 assert(lp_check_value(type, a));
1898
1899 if(a == bld->zero)
1900 return bld->undef;
1901 if(a == bld->one)
1902 return bld->one;
1903 if(a == bld->undef)
1904 return bld->undef;
1905
1906 assert(type.floating);
1907
1908 if(LLVMIsConstant(a))
1909 return LLVMConstFDiv(bld->one, a);
1910
1911 /*
1912 * We don't use RCPPS because:
1913     * - it only has 10 bits of precision
1914     * - it doesn't even get the reciprocal of 1.0 exactly
1915     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1916     * - for recent processors the benefit over DIVPS is marginal, and case
1917     *   dependent
1918     *
1919     * We could still use it on certain processors if benchmarks show that the
1920     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
1921     * particular uses that require fewer workarounds.
1922 */
1923
1924 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1925 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
1926 const unsigned num_iterations = 0;
1927 LLVMValueRef res;
1928 unsigned i;
1929 const char *intrinsic = NULL;
1930
1931 if (type.length == 4) {
1932 intrinsic = "llvm.x86.sse.rcp.ps";
1933 }
1934 else {
1935 intrinsic = "llvm.x86.avx.rcp.ps.256";
1936 }
1937
1938 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1939
1940 for (i = 0; i < num_iterations; ++i) {
1941 res = lp_build_rcp_refine(bld, a, res);
1942 }
1943
1944 return res;
1945 }
1946
1947 return LLVMBuildFDiv(builder, bld->one, a, "");
1948 }
1949
1950
1951 /**
1952 * Do one Newton-Raphson step to improve rsqrt precision:
1953 *
1954 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1955 *
1956 * See also:
1957 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1958 */
1959 static INLINE LLVMValueRef
1960 lp_build_rsqrt_refine(struct lp_build_context *bld,
1961 LLVMValueRef a,
1962 LLVMValueRef rsqrt_a)
1963 {
1964 LLVMBuilderRef builder = bld->gallivm->builder;
1965 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1966 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1967 LLVMValueRef res;
1968
1969 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1970 res = LLVMBuildFMul(builder, a, res, "");
1971 res = LLVMBuildFSub(builder, three, res, "");
1972 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1973 res = LLVMBuildFMul(builder, half, res, "");
1974
1975 return res;
1976 }
1977
1978
1979 /**
1980 * Generate 1/sqrt(a)
1981 */
1982 LLVMValueRef
1983 lp_build_rsqrt(struct lp_build_context *bld,
1984 LLVMValueRef a)
1985 {
1986 LLVMBuilderRef builder = bld->gallivm->builder;
1987 const struct lp_type type = bld->type;
1988
1989 assert(lp_check_value(type, a));
1990
1991 assert(type.floating);
1992
1993 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1994 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1995 const unsigned num_iterations = 1;
1996 LLVMValueRef res;
1997 unsigned i;
1998 const char *intrinsic = NULL;
1999
2000 if (type.length == 4) {
2001 intrinsic = "llvm.x86.sse.rsqrt.ps";
2002 }
2003 else {
2004 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2005 }
2006
2007 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2008
2009
2010 for (i = 0; i < num_iterations; ++i) {
2011 res = lp_build_rsqrt_refine(bld, a, res);
2012 }
2013
2014 return res;
2015 }
2016
2017 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2018 }
2019
2020
2021 /**
2022 * Generate sin(a) using SSE2
2023 */
2024 LLVMValueRef
2025 lp_build_sin(struct lp_build_context *bld,
2026 LLVMValueRef a)
2027 {
2028 struct gallivm_state *gallivm = bld->gallivm;
2029 LLVMBuilderRef builder = gallivm->builder;
2030 struct lp_type int_type = lp_int_type(bld->type);
2031 LLVMBuilderRef b = builder;
2032
2033 /*
2034 * take the absolute value,
2035 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2036 */
2037
2038 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2039 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2040
2041 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2042 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2043
2044 /*
2045 * extract the sign bit (upper one)
2046 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2047 */
2048 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2049 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2050
2051 /*
2052 * scale by 4/Pi
2053 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2054 */
2055
2056 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2057 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
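   /*
    * Scaling by 4/Pi expresses x in units of Pi/4 octants: the low bits of
    * the rounded integer part select the octant, which in turn determines
    * both the polynomial and the sign of the result (lp_build_cos below
    * uses the same scheme).
    */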
2058
2059 /*
2060 * store the integer part of y in mm0
2061 * emm2 = _mm_cvttps_epi32(y);
2062 */
2063
2064 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2065
2066 /*
2067 * j=(j+1) & (~1) (see the cephes sources)
2068 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2069 */
2070
2071 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2072 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2073 /*
2074 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2075 */
2076 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2077 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2078
2079 /*
2080 * y = _mm_cvtepi32_ps(emm2);
2081 */
2082 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2083
2084 /* get the swap sign flag
2085 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2086 */
2087 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2088 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2089
2090 /*
2091 * emm2 = _mm_slli_epi32(emm0, 29);
2092 */
2093 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2094 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2095
2096 /*
2097    * get the polynomial selection mask
2098    * there is one polynomial for 0 <= x <= Pi/4
2099    * and another one for Pi/4 < x <= Pi/2
2100 * Both branches will be computed.
2101 *
2102 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2103 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2104 */
2105
2106 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2107 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2108 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2109 int_type, PIPE_FUNC_EQUAL,
2110 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2111 /*
2112 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2113 */
2114 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2115
2116 /*
2117 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2118 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2119 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2120 */
2121 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2122 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2123 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2124
2125 /*
2126 * The magic pass: "Extended precision modular arithmetic"
2127 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2128 * xmm1 = _mm_mul_ps(y, xmm1);
2129 * xmm2 = _mm_mul_ps(y, xmm2);
2130 * xmm3 = _mm_mul_ps(y, xmm3);
2131 */
2132 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2133 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2134 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2135
2136 /*
2137 * x = _mm_add_ps(x, xmm1);
2138 * x = _mm_add_ps(x, xmm2);
2139 * x = _mm_add_ps(x, xmm3);
2140 */
2141
2142 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2143 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2144 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2145
2146 /*
2147    * Evaluate the first polynomial (0 <= x <= Pi/4)
2148 *
2149 * z = _mm_mul_ps(x,x);
2150 */
2151 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2152
2153 /*
2154 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2155 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2156 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2157 */
2158 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2159 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2160 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2161
2162 /*
2163 * y = *(v4sf*)_ps_coscof_p0;
2164 * y = _mm_mul_ps(y, z);
2165 */
2166 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2167 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2168 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2169 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2170 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2171 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2172
2173
2174 /*
2175 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2176 * y = _mm_sub_ps(y, tmp);
2177 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2178 */
2179 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2180 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2181 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2182 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2183 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2184
2185 /*
2186 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2187 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2188 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2189 */
2190 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2191 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2192 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2193
2194 /*
2195    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2196 *
2197 * y2 = *(v4sf*)_ps_sincof_p0;
2198 * y2 = _mm_mul_ps(y2, z);
2199 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2200 * y2 = _mm_mul_ps(y2, z);
2201 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2202 * y2 = _mm_mul_ps(y2, z);
2203 * y2 = _mm_mul_ps(y2, x);
2204 * y2 = _mm_add_ps(y2, x);
2205 */
2206
2207 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2208 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2209 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2210 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2211 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2212 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2213 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2214
2215 /*
2216    * select the correct result from the two polynomials
2217 * xmm3 = poly_mask;
2218 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2219 * y = _mm_andnot_ps(xmm3, y);
2220 * y = _mm_add_ps(y,y2);
2221 */
2222 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2223 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2224 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2225 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2226 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2227 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2228 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2229
2230 /*
2231 * update the sign
2232 * y = _mm_xor_ps(y, sign_bit);
2233 */
2234 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2235 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2236 return y_result;
2237 }
2238
2239
2240 /**
2241 * Generate cos(a) using SSE2
2242 */
2243 LLVMValueRef
2244 lp_build_cos(struct lp_build_context *bld,
2245 LLVMValueRef a)
2246 {
2247 struct gallivm_state *gallivm = bld->gallivm;
2248 LLVMBuilderRef builder = gallivm->builder;
2249 struct lp_type int_type = lp_int_type(bld->type);
2250 LLVMBuilderRef b = builder;
2251
2252 /*
2253 * take the absolute value,
2254 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2255 */
2256
2257 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2258 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2259
2260 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2261 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2262
2263 /*
2264 * scale by 4/Pi
2265 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2266 */
2267
2268 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2269 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2270
2271 /*
2272 * store the integer part of y in mm0
2273 * emm2 = _mm_cvttps_epi32(y);
2274 */
2275
2276 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2277
2278 /*
2279 * j=(j+1) & (~1) (see the cephes sources)
2280 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2281 */
2282
2283 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2284 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2285 /*
2286 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2287 */
2288 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2289 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2290
2291 /*
2292 * y = _mm_cvtepi32_ps(emm2);
2293 */
2294 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2295
2296
2297 /*
2298 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2299 */
2300 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2301 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2302
2303
2304 /* get the swap sign flag
2305 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2306 */
2307 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2308 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2309 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2310 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2311
2312 /*
2313 * emm2 = _mm_slli_epi32(emm0, 29);
2314 */
2315 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2316 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2317
2318 /*
2319    * get the polynomial selection mask
2320    * there is one polynomial for 0 <= x <= Pi/4
2321    * and another one for Pi/4 < x <= Pi/2
2322 * Both branches will be computed.
2323 *
2324 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2325 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2326 */
2327
2328 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2329 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2330 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2331 int_type, PIPE_FUNC_EQUAL,
2332 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2333
2334 /*
2335 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2336 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2337 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2338 */
2339 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2340 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2341 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2342
2343 /*
2344 * The magic pass: "Extended precision modular arithmetic"
2345 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2346 * xmm1 = _mm_mul_ps(y, xmm1);
2347 * xmm2 = _mm_mul_ps(y, xmm2);
2348 * xmm3 = _mm_mul_ps(y, xmm3);
2349 */
2350 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2351 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2352 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2353
2354 /*
2355 * x = _mm_add_ps(x, xmm1);
2356 * x = _mm_add_ps(x, xmm2);
2357 * x = _mm_add_ps(x, xmm3);
2358 */
2359
2360 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2361 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2362 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2363
2364 /*
2365    * Evaluate the first polynomial (0 <= x <= Pi/4)
2366 *
2367 * z = _mm_mul_ps(x,x);
2368 */
2369 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2370
2371 /*
2372 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2373 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2374 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2375 */
2376 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2377 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2378 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2379
2380 /*
2381 * y = *(v4sf*)_ps_coscof_p0;
2382 * y = _mm_mul_ps(y, z);
2383 */
2384 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2385 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2386 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2387 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2388 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2389 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2390
2391
2392 /*
2393 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2394 * y = _mm_sub_ps(y, tmp);
2395 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2396 */
2397 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2398 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2399 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2400 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2401 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2402
2403 /*
2404 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2405 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2406 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2407 */
2408 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2409 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2410 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2411
2412 /*
2413    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2414 *
2415 * y2 = *(v4sf*)_ps_sincof_p0;
2416 * y2 = _mm_mul_ps(y2, z);
2417 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2418 * y2 = _mm_mul_ps(y2, z);
2419 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2420 * y2 = _mm_mul_ps(y2, z);
2421 * y2 = _mm_mul_ps(y2, x);
2422 * y2 = _mm_add_ps(y2, x);
2423 */
2424
2425 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2426 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2427 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2428 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2429 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2430 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2431 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2432
2433 /*
2434    * select the correct result from the two polynomials
2435 * xmm3 = poly_mask;
2436 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2437 * y = _mm_andnot_ps(xmm3, y);
2438 * y = _mm_add_ps(y,y2);
2439 */
2440 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2441 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2442 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2443 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2444 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2445 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2446
2447 /*
2448 * update the sign
2449 * y = _mm_xor_ps(y, sign_bit);
2450 */
2451 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2452 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2453 return y_result;
2454 }
2455
2456
2457 /**
2458 * Generate pow(x, y)
2459 */
2460 LLVMValueRef
2461 lp_build_pow(struct lp_build_context *bld,
2462 LLVMValueRef x,
2463 LLVMValueRef y)
2464 {
2465 /* TODO: optimize the constant case */
2466 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2467 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2468 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2469 __FUNCTION__);
2470 }
2471
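   /*
    * pow(x, y) = exp2(y * log2(x)).  Only meaningful for x > 0: the log2
    * approximation below effectively ignores the sign bit, so results for
    * x <= 0 are undefined.
    */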
2472 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2473 }
2474
2475
2476 /**
2477 * Generate exp(x)
2478 */
2479 LLVMValueRef
2480 lp_build_exp(struct lp_build_context *bld,
2481 LLVMValueRef x)
2482 {
2483 /* log2(e) = 1/log(2) */
2484 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2485 1.4426950408889634);
2486
2487 assert(lp_check_value(bld->type, x));
2488
2489 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2490 }
2491
2492
2493 /**
2494 * Generate log(x)
2495 */
2496 LLVMValueRef
2497 lp_build_log(struct lp_build_context *bld,
2498 LLVMValueRef x)
2499 {
2500 /* log(2) */
2501 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2502 0.69314718055994529);
2503
2504 assert(lp_check_value(bld->type, x));
2505
2506 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2507 }
2508
2509
2510 /**
2511 * Generate polynomial.
2512 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2513 */
2514 static LLVMValueRef
2515 lp_build_polynomial(struct lp_build_context *bld,
2516 LLVMValueRef x,
2517 const double *coeffs,
2518 unsigned num_coeffs)
2519 {
2520 const struct lp_type type = bld->type;
2521 LLVMValueRef even = NULL, odd = NULL;
2522 LLVMValueRef x2;
2523 unsigned i;
2524
2525 assert(lp_check_value(bld->type, x));
2526
2527 /* TODO: optimize the constant case */
2528 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2529 LLVMIsConstant(x)) {
2530 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2531 __FUNCTION__);
2532 }
2533
2534 /*
2535    * Calculate odd and even terms separately to decrease data dependency
2536 * Ex:
2537 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2538 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2539 */
2540 x2 = lp_build_mul(bld, x, x);
2541
2542 for (i = num_coeffs; i--; ) {
2543 LLVMValueRef coeff;
2544
2545 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2546
2547 if (i % 2 == 0) {
2548 if (even)
2549 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2550 else
2551 even = coeff;
2552 } else {
2553 if (odd)
2554 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2555 else
2556 odd = coeff;
2557 }
2558 }
2559
2560 if (odd)
2561 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2562 else if (even)
2563 return even;
2564 else
2565 return bld->undef;
2566 }
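/*
 * Example (a sketch): generating code for 1.0 + 0.5*x + 0.25*x^2,
 * assuming an initialized float-vector context "bld":
 *
 *    static const double coeffs[3] = { 1.0, 0.5, 0.25 };
 *    LLVMValueRef y = lp_build_polynomial(bld, x, coeffs, 3);
 */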
2567
2568
2569 /**
2570 * Minimax polynomial fit of 2**x, in range [0, 1[
2571 */
2572 const double lp_build_exp2_polynomial[] = {
2573 #if EXP_POLY_DEGREE == 5
2574 0.999999925063526176901,
2575 0.693153073200168932794,
2576 0.240153617044375388211,
2577 0.0558263180532956664775,
2578 0.00898934009049466391101,
2579 0.00187757667519147912699
2580 #elif EXP_POLY_DEGREE == 4
2581 1.00000259337069434683,
2582 0.693003834469974940458,
2583 0.24144275689150793076,
2584 0.0520114606103070150235,
2585 0.0135341679161270268764
2586 #elif EXP_POLY_DEGREE == 3
2587 0.999925218562710312959,
2588 0.695833540494823811697,
2589 0.226067155427249155588,
2590 0.0780245226406372992967
2591 #elif EXP_POLY_DEGREE == 2
2592 1.00172476321474503578,
2593 0.657636275736077639316,
2594 0.33718943461968720704
2595 #else
2596 #error
2597 #endif
2598 };
2599
2600
2601 void
2602 lp_build_exp2_approx(struct lp_build_context *bld,
2603 LLVMValueRef x,
2604 LLVMValueRef *p_exp2_int_part,
2605 LLVMValueRef *p_frac_part,
2606 LLVMValueRef *p_exp2)
2607 {
2608 LLVMBuilderRef builder = bld->gallivm->builder;
2609 const struct lp_type type = bld->type;
2610 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2611 LLVMValueRef ipart = NULL;
2612 LLVMValueRef fpart = NULL;
2613 LLVMValueRef expipart = NULL;
2614 LLVMValueRef expfpart = NULL;
2615 LLVMValueRef res = NULL;
2616
2617 assert(lp_check_value(bld->type, x));
2618
2619 if(p_exp2_int_part || p_frac_part || p_exp2) {
2620 /* TODO: optimize the constant case */
2621 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2622 LLVMIsConstant(x)) {
2623 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2624 __FUNCTION__);
2625 }
2626
2627 assert(type.floating && type.width == 32);
2628
2629 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2630 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
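      /*
       * The clamp above keeps the integer part roughly within the exponent
       * range a single-precision float can represent, so the bit-level
       * construction of 2^ipart below stays valid.
       */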
2631
2632 /* ipart = floor(x) */
2633 /* fpart = x - ipart */
2634 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2635 }
2636
2637 if(p_exp2_int_part || p_exp2) {
2638 /* expipart = (float) (1 << ipart) */
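      /*
       * ((ipart + 127) << 23) reinterpreted as a float is exactly 1.0 * 2^ipart:
       * 127 is the IEEE-754 single-precision exponent bias and the exponent
       * field starts at bit 23.
       */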
2639 expipart = LLVMBuildAdd(builder, ipart,
2640 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2641 expipart = LLVMBuildShl(builder, expipart,
2642 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2643 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2644 }
2645
2646 if(p_exp2) {
2647 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2648 Elements(lp_build_exp2_polynomial));
2649
2650 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2651 }
2652
2653 if(p_exp2_int_part)
2654 *p_exp2_int_part = expipart;
2655
2656 if(p_frac_part)
2657 *p_frac_part = fpart;
2658
2659 if(p_exp2)
2660 *p_exp2 = res;
2661 }
2662
2663
2664 LLVMValueRef
2665 lp_build_exp2(struct lp_build_context *bld,
2666 LLVMValueRef x)
2667 {
2668 LLVMValueRef res;
2669 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2670 return res;
2671 }
2672
2673
2674 /**
2675  * Extract the exponent of an IEEE-754 floating point value.
2676 *
2677 * Optionally apply an integer bias.
2678 *
2679 * Result is an integer value with
2680 *
2681 * ifloor(log2(x)) + bias
2682 */
2683 LLVMValueRef
2684 lp_build_extract_exponent(struct lp_build_context *bld,
2685 LLVMValueRef x,
2686 int bias)
2687 {
2688 LLVMBuilderRef builder = bld->gallivm->builder;
2689 const struct lp_type type = bld->type;
2690 unsigned mantissa = lp_mantissa(type);
2691 LLVMValueRef res;
2692
2693 assert(type.floating);
2694
2695 assert(lp_check_value(bld->type, x));
2696
2697 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
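   /*
    * For 32-bit floats the biased exponent occupies bits 23..30: shift right
    * by the mantissa width, mask the 8 exponent bits, then subtract the
    * IEEE bias (127) while folding in the caller's bias.
    */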
2698
2699 res = LLVMBuildLShr(builder, x,
2700 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2701 res = LLVMBuildAnd(builder, res,
2702 lp_build_const_int_vec(bld->gallivm, type, 255), "");
2703 res = LLVMBuildSub(builder, res,
2704 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2705
2706 return res;
2707 }
2708
2709
2710 /**
2711  * Extract the mantissa of a floating point value.
2712  *
2713  * Result is a floating point value with
2714  *
2715  *   x / 2**floor(log2(x))
2716 */
2717 LLVMValueRef
2718 lp_build_extract_mantissa(struct lp_build_context *bld,
2719 LLVMValueRef x)
2720 {
2721 LLVMBuilderRef builder = bld->gallivm->builder;
2722 const struct lp_type type = bld->type;
2723 unsigned mantissa = lp_mantissa(type);
2724 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2725 (1ULL << mantissa) - 1);
2726 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2727 LLVMValueRef res;
2728
2729 assert(lp_check_value(bld->type, x));
2730
2731 assert(type.floating);
2732
2733 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2734
2735 /* res = x / 2**ipart */
2736 res = LLVMBuildAnd(builder, x, mantmask, "");
2737 res = LLVMBuildOr(builder, res, one, "");
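   /* OR-ing in the exponent bits of 1.0 rescales the mantissa into [1, 2) */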
2738 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2739
2740 return res;
2741 }
2742
2743
2744
2745 /**
2746  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
2747  * These coefficients can be generated with
2748 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2749 */
2750 const double lp_build_log2_polynomial[] = {
2751 #if LOG_POLY_DEGREE == 5
2752 2.88539008148777786488L,
2753 0.961796878841293367824L,
2754 0.577058946784739859012L,
2755 0.412914355135828735411L,
2756 0.308591899232910175289L,
2757 0.352376952300281371868L,
2758 #elif LOG_POLY_DEGREE == 4
2759 2.88539009343309178325L,
2760 0.961791550404184197881L,
2761 0.577440339438736392009L,
2762 0.403343858251329912514L,
2763 0.406718052498846252698L,
2764 #elif LOG_POLY_DEGREE == 3
2765 2.88538959748872753838L,
2766 0.961932915889597772928L,
2767 0.571118517972136195241L,
2768 0.493997535084709500285L,
2769 #else
2770 #error
2771 #endif
2772 };
2773
2774 /**
2775 * See http://www.devmaster.net/forums/showthread.php?p=43580
2776 * http://en.wikipedia.org/wiki/Logarithm#Calculation
2777 * http://www.nezumi.demon.co.uk/consult/logx.htm
2778 */
2779 void
2780 lp_build_log2_approx(struct lp_build_context *bld,
2781 LLVMValueRef x,
2782 LLVMValueRef *p_exp,
2783 LLVMValueRef *p_floor_log2,
2784 LLVMValueRef *p_log2)
2785 {
2786 LLVMBuilderRef builder = bld->gallivm->builder;
2787 const struct lp_type type = bld->type;
2788 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2789 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2790
2791 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2792 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2793 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2794
2795 LLVMValueRef i = NULL;
2796 LLVMValueRef y = NULL;
2797 LLVMValueRef z = NULL;
2798 LLVMValueRef exp = NULL;
2799 LLVMValueRef mant = NULL;
2800 LLVMValueRef logexp = NULL;
2801 LLVMValueRef logmant = NULL;
2802 LLVMValueRef res = NULL;
2803
2804 assert(lp_check_value(bld->type, x));
2805
2806 if(p_exp || p_floor_log2 || p_log2) {
2807 /* TODO: optimize the constant case */
2808 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2809 LLVMIsConstant(x)) {
2810 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2811 __FUNCTION__);
2812 }
2813
2814 assert(type.floating && type.width == 32);
2815
2816 /*
2817 * We don't explicitly handle denormalized numbers. They will yield a
2818       * result in the neighbourhood of -127, which appears to be adequate
2819       * in practice.
2820 */
2821
2822 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2823
2824 /* exp = (float) exponent(x) */
2825 exp = LLVMBuildAnd(builder, i, expmask, "");
2826 }
2827
2828 if(p_floor_log2 || p_log2) {
2829 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2830 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2831 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2832 }
2833
2834 if(p_log2) {
2835 /* mant = 1 + (float) mantissa(x) */
2836 mant = LLVMBuildAnd(builder, i, mantmask, "");
2837 mant = LLVMBuildOr(builder, mant, one, "");
2838 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2839
2840 /* y = (mant - 1) / (mant + 1) */
2841 y = lp_build_div(bld,
2842 lp_build_sub(bld, mant, bld->one),
2843 lp_build_add(bld, mant, bld->one)
2844 );
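      /*
       * Since log2(mant) = 2*atanh(y)/ln(2) with y = (mant - 1)/(mant + 1),
       * the polynomial P(z) below, in z = y^2, approximates
       * 2*atanh(y)/(y*ln(2)); note its leading coefficient is
       * 2/ln(2) = 2.88539...
       */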
2845
2846 /* z = y^2 */
2847 z = lp_build_mul(bld, y, y);
2848
2849 /* compute P(z) */
2850 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
2851 Elements(lp_build_log2_polynomial));
2852
2853 /* logmant = y * P(z) */
2854 logmant = lp_build_mul(bld, y, logmant);
2855
2856 res = lp_build_add(bld, logmant, logexp);
2857 }
2858
2859 if(p_exp) {
2860 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2861 *p_exp = exp;
2862 }
2863
2864 if(p_floor_log2)
2865 *p_floor_log2 = logexp;
2866
2867 if(p_log2)
2868 *p_log2 = res;
2869 }
2870
2871
2872 LLVMValueRef
2873 lp_build_log2(struct lp_build_context *bld,
2874 LLVMValueRef x)
2875 {
2876 LLVMValueRef res;
2877 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2878 return res;
2879 }
2880
2881
2882 /**
2883 * Faster (and less accurate) log2.
2884 *
2885 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2886 *
2887  * Piecewise linear approximation, with exact results when x is a
2888 * power of two.
2889 *
2890 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2891 */
2892 LLVMValueRef
2893 lp_build_fast_log2(struct lp_build_context *bld,
2894 LLVMValueRef x)
2895 {
2896 LLVMBuilderRef builder = bld->gallivm->builder;
2897 LLVMValueRef ipart;
2898 LLVMValueRef fpart;
2899
2900 assert(lp_check_value(bld->type, x));
2901
2902 assert(bld->type.floating);
2903
2904 /* ipart = floor(log2(x)) - 1 */
2905 ipart = lp_build_extract_exponent(bld, x, -1);
2906 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2907
2908 /* fpart = x / 2**ipart */
2909 fpart = lp_build_extract_mantissa(bld, x);
2910
2911 /* ipart + fpart */
2912 return LLVMBuildFAdd(builder, ipart, fpart, "");
2913 }
2914
2915
2916 /**
2917 * Fast implementation of iround(log2(x)).
2918 *
2919 * Not an approximation -- it should give accurate results all the time.
2920 */
2921 LLVMValueRef
2922 lp_build_ilog2(struct lp_build_context *bld,
2923 LLVMValueRef x)
2924 {
2925 LLVMBuilderRef builder = bld->gallivm->builder;
2926 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2927 LLVMValueRef ipart;
2928
2929 assert(bld->type.floating);
2930
2931 assert(lp_check_value(bld->type, x));
2932
2933    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
2934 x = LLVMBuildFMul(builder, x, sqrt2, "");
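   /*
    * Worked example: for x = 3, log2(3) ~= 1.585 and iround(1.585) = 2;
    * after scaling, 3*sqrt(2) ~= 4.243 and floor(log2(4.243)) = 2.
    */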
2935
2936 /* ipart = floor(log2(x) + 0.5) */
2937 ipart = lp_build_extract_exponent(bld, x, 0);
2938
2939 return ipart;
2940 }
2941
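/**
 * Generate x % y, using frem/srem/urem according to the type.
 */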
2942 LLVMValueRef
2943 lp_build_mod(struct lp_build_context *bld,
2944 LLVMValueRef x,
2945 LLVMValueRef y)
2946 {
2947 LLVMBuilderRef builder = bld->gallivm->builder;
2948 LLVMValueRef res;
2949 const struct lp_type type = bld->type;
2950
2951 assert(lp_check_value(type, x));
2952 assert(lp_check_value(type, y));
2953
2954 if (type.floating)
2955 res = LLVMBuildFRem(builder, x, y, "");
2956 else if (type.sign)
2957 res = LLVMBuildSRem(builder, x, y, "");
2958 else
2959 res = LLVMBuildURem(builder, x, y, "");
2960 return res;
2961 }
2962