/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since the
 * values generally need to be repacked across registers.
 *
 * To remember, there are a few invariants in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
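 * For example, converting four float32x4 vectors into a single uint8x16
 * vector preserves both invariants: 32 * 4 == 8 * 16 and 4 * 4 == 16 * 1.
 * (This is exactly the 4x4f -> 1x16ub fast path in lp_build_conv below.)
 *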
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs. efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"


/**
 * Converts a vector of int16 half-floats to float32.
 * Note this can be performed in a single instruction where the F16C extension
 * is available (vcvtph2ps) [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
 *
 * @param src_type  type of the int16 vector
 * @param src       value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       LLVMValueRef src)
{
   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);

   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

   /* Constants */
   LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
   LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
   LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
   LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
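   /*
    * (254 - 15) << 23 is 2^112 viewed as a float: after the << 13 shift below
    * it rebiases the half exponent (bias 15) to the single-precision bias
    * (127), and also renormalizes denormal halves.
    */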
   LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
                                             lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
                                             float_vec_type, "");

   /* Convert int16 vector to int32 vector by zero ext */
   LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");

   /* Exponent / mantissa bits */
   LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
   LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");

   /* Exponent adjust */
   LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");

   /* Make sure Inf/NaN survive */
   LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
   LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");

   /* Sign bit */
   LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
   LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");

   /* Combine result */
   LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
   LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");

   /* Cast from int32 vector to float32 vector */
   return LLVMBuildBitCast(builder, final, float_vec_type, "");
}


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there were, since the FP's mantissa takes only a fraction of the
 * register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the source.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear in
       * the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */
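      /*
       * For instance, with dst_width = 8 and a 23-bit mantissa:
       * scale = 255/256 and bias = 2^15, so a clamped x in [0, 1] becomes
       * 2^15 + x*255/256, whose lowest 8 mantissa bits hold round(x * 255),
       * which the mask below then extracts.
       */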

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination exceeds what can be represented in floating point,
       * so multiply by the largest power of two we can get away with, and
       * then subtract the most significant bit to rescale to normalized
       * values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned by FPToSI on
       * overflow, which is the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */
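      /*
       * E.g. for a 32-bit float source and dst_width = 32: n = 31, so values
       * are scaled by 2^31 and converted; 1.0 overflows FPToSI to INT_MIN,
       * the left shift by 1 turns that into 0, and subtracting the
       * right-shifted MSB (1) yields 0xffffffff, i.e. exactly 2^32 - 1.
       */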

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (now aligned to the LSB), thereby rescaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
       */

      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating point
       * (i.e., mantissa + 1 bits). So do a straight multiplication followed
       * by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */
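      /*
       * After the shift below the value fits in the mantissa, so it can be
       * OR'ed directly into the mantissa of the constant bias (a power of
       * two), giving bias + value/2^n as a float; subtracting the bias and
       * rescaling then yields the final [0, 1] result without needing an
       * integer-to-float conversion.
       */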

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels, only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /* Special case 4x4f --> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed == 0 &&
       src_type.sign == 1 &&
       src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.sign == 0 &&
       dst_type.norm == 1 &&
       dst_type.width == 8 &&
       dst_type.length == 16 &&

       4 * num_dsts == num_srcs &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

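      /*
       * Per destination: scale the four float vectors to [0, 255], round to
       * int32, then narrow 4 x int32x4 -> 2 x int16x8 -> 1 x uint8x16 using
       * the saturating SSE2 pack instructions.
       */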
      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }

      return;
   }

   /* Special case 2x8f --> 1x16ub
    */
   else if (src_type.floating == 1 &&
            src_type.fixed == 0 &&
            src_type.sign == 1 &&
            src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.sign == 0 &&
            dst_type.norm == 1 &&
            dst_type.width == 8 &&
            dst_type.length == 16 &&

            2 * num_dsts == num_srcs &&

            util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

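      /*
       * Same idea as the SSE2 path above, except each 8-wide i32 result is
       * first split into two 128-bit halves before the saturating packs.
       */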
      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");

         a = lp_build_iround(&bld, a);
         b = lp_build_iround(&bld, b);

         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }

   /* Pre-convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
            if(src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */
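   /*
    * E.g. 4 x int32x4 --> 1 x int8x16: lp_build_resize just repacks the raw
    * integer bits into narrower (or wider) elements; the value scaling itself
    * was handled above.
    */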

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized floating point type for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or
 * one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{

   /* We must not lose or gain channels, only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the float/fixed/norm flags and treat everything as plain signed
    * integers.
    *
    * We assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}