1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions for computing color endpoints and texel weights.
22 */
23
24 #include <cassert>
25
26 #include "astcenc_internal.h"
27 #include "astcenc_vecmathlib.h"
28
29 /**
30 * @brief Compute the infilled weight for N texel indices in a decimated grid.
31 *
32 * @param di The weight grid decimation to use.
33 * @param weights The decimated weight values to use.
34 * @param index The first texel index to interpolate.
35 *
36 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
37 */
bilinear_infill_vla(const decimation_info & di,const float * weights,unsigned int index)38 static vfloat bilinear_infill_vla(
39 const decimation_info& di,
40 const float* weights,
41 unsigned int index
42 ) {
43 // Load the bilinear filter texel weight indexes in the decimated grid
44 vint weight_idx0 = vint(di.texel_weights_4t[0] + index);
45 vint weight_idx1 = vint(di.texel_weights_4t[1] + index);
46 vint weight_idx2 = vint(di.texel_weights_4t[2] + index);
47 vint weight_idx3 = vint(di.texel_weights_4t[3] + index);
48
49 // Load the bilinear filter weights from the decimated grid
50 vfloat weight_val0 = gatherf(weights, weight_idx0);
51 vfloat weight_val1 = gatherf(weights, weight_idx1);
52 vfloat weight_val2 = gatherf(weights, weight_idx2);
53 vfloat weight_val3 = gatherf(weights, weight_idx3);
54
55 // Load the weight contribution factors for each decimated weight
56 vfloat tex_weight_float0 = loada(di.texel_weights_float_4t[0] + index);
57 vfloat tex_weight_float1 = loada(di.texel_weights_float_4t[1] + index);
58 vfloat tex_weight_float2 = loada(di.texel_weights_float_4t[2] + index);
59 vfloat tex_weight_float3 = loada(di.texel_weights_float_4t[3] + index);
60
61 // Compute the bilinear interpolation to generate the per-texel weight
62 return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
63 (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
64 }
65
66 /**
67 * @brief Compute the infilled weight for N texel indices in a decimated grid.
68 *
69 * This is specialized version which computes only two weights per texel for
70 * encodings that are only decimated in a single axis.
71 *
72 * @param di The weight grid decimation to use.
73 * @param weights The decimated weight values to use.
74 * @param index The first texel index to interpolate.
75 *
76 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
77 */
bilinear_infill_vla_2(const decimation_info & di,const float * weights,unsigned int index)78 static vfloat bilinear_infill_vla_2(
79 const decimation_info& di,
80 const float* weights,
81 unsigned int index
82 ) {
83 // Load the bilinear filter texel weight indexes in the decimated grid
84 vint weight_idx0 = vint(di.texel_weights_4t[0] + index);
85 vint weight_idx1 = vint(di.texel_weights_4t[1] + index);
86
87 // Load the bilinear filter weights from the decimated grid
88 vfloat weight_val0 = gatherf(weights, weight_idx0);
89 vfloat weight_val1 = gatherf(weights, weight_idx1);
90
91 // Load the weight contribution factors for each decimated weight
92 vfloat tex_weight_float0 = loada(di.texel_weights_float_4t[0] + index);
93 vfloat tex_weight_float1 = loada(di.texel_weights_float_4t[1] + index);
94
95 // Compute the bilinear interpolation to generate the per-texel weight
96 return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
97 }
98
99 /**
100 * @brief Compute the ideal endpoints and weights for 1 color component.
101 *
102 * @param blk The image block color data to compress.
103 * @param pi The partition info for the current trial.
104 * @param[out] ei The computed ideal endpoints and weights.
105 * @param component The color component to compute.
106 */
compute_ideal_colors_and_weights_1_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int component)107 static void compute_ideal_colors_and_weights_1_comp(
108 const image_block& blk,
109 const partition_info& pi,
110 endpoints_and_weights& ei,
111 unsigned int component
112 ) {
113 unsigned int partition_count = pi.partition_count;
114 ei.ep.partition_count = partition_count;
115 promise(partition_count > 0);
116
117 unsigned int texel_count = blk.texel_count;
118 promise(texel_count > 0);
119
120 float error_weight;
121 const float* data_vr = nullptr;
122
123 assert(component < BLOCK_MAX_COMPONENTS);
124 switch (component)
125 {
126 case 0:
127 error_weight = blk.channel_weight.lane<0>();
128 data_vr = blk.data_r;
129 break;
130 case 1:
131 error_weight = blk.channel_weight.lane<1>();
132 data_vr = blk.data_g;
133 break;
134 case 2:
135 error_weight = blk.channel_weight.lane<2>();
136 data_vr = blk.data_b;
137 break;
138 default:
139 assert(component == 3);
140 error_weight = blk.channel_weight.lane<3>();
141 data_vr = blk.data_a;
142 break;
143 }
144
145 vmask4 sep_mask = vint4::lane_id() == vint4(component);
146 bool is_constant_wes { true };
147 float partition0_len_sq { 0.0f };
148
149 for (unsigned int i = 0; i < partition_count; i++)
150 {
151 float lowvalue { 1e10f };
152 float highvalue { -1e10f };
153
154 unsigned int partition_texel_count = pi.partition_texel_count[i];
155 for (unsigned int j = 0; j < partition_texel_count; j++)
156 {
157 unsigned int tix = pi.texels_of_partition[i][j];
158 float value = data_vr[tix];
159 lowvalue = astc::min(value, lowvalue);
160 highvalue = astc::max(value, highvalue);
161 }
162
163 if (highvalue <= lowvalue)
164 {
165 lowvalue = 0.0f;
166 highvalue = 1e-7f;
167 }
168
169 float length = highvalue - lowvalue;
170 float length_squared = length * length;
171 float scale = 1.0f / length;
172
173 if (i == 0)
174 {
175 partition0_len_sq = length_squared;
176 }
177 else
178 {
179 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
180 }
181
182 for (unsigned int j = 0; j < partition_texel_count; j++)
183 {
184 unsigned int tix = pi.texels_of_partition[i][j];
185 float value = (data_vr[tix] - lowvalue) * scale;
186 value = astc::clamp1f(value);
187
188 ei.weights[tix] = value;
189 ei.weight_error_scale[tix] = length_squared * error_weight;
190 assert(!astc::isnan(ei.weight_error_scale[tix]));
191 }
192
193 ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
194 ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
195 }
196
197 // Zero initialize any SIMD over-fetch
198 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
199 for (unsigned int i = texel_count; i < texel_count_simd; i++)
200 {
201 ei.weights[i] = 0.0f;
202 ei.weight_error_scale[i] = 0.0f;
203 }
204
205 ei.is_constant_weight_error_scale = is_constant_wes;
206 }
207
208 /**
209 * @brief Compute the ideal endpoints and weights for 2 color components.
210 *
211 * @param blk The image block color data to compress.
212 * @param pi The partition info for the current trial.
213 * @param[out] ei The computed ideal endpoints and weights.
214 * @param component1 The first color component to compute.
215 * @param component2 The second color component to compute.
216 */
compute_ideal_colors_and_weights_2_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,int component1,int component2)217 static void compute_ideal_colors_and_weights_2_comp(
218 const image_block& blk,
219 const partition_info& pi,
220 endpoints_and_weights& ei,
221 int component1,
222 int component2
223 ) {
224 unsigned int partition_count = pi.partition_count;
225 ei.ep.partition_count = partition_count;
226 promise(partition_count > 0);
227
228 unsigned int texel_count = blk.texel_count;
229 promise(texel_count > 0);
230
231 partition_metrics pms[BLOCK_MAX_PARTITIONS];
232
233 float error_weight;
234 const float* data_vr = nullptr;
235 const float* data_vg = nullptr;
236
237 if (component1 == 0 && component2 == 1)
238 {
239 error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
240
241 data_vr = blk.data_r;
242 data_vg = blk.data_g;
243 }
244 else if (component1 == 0 && component2 == 2)
245 {
246 error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
247
248 data_vr = blk.data_r;
249 data_vg = blk.data_b;
250 }
251 else // (component1 == 1 && component2 == 2)
252 {
253 assert(component1 == 1 && component2 == 2);
254
255 error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
256
257 data_vr = blk.data_g;
258 data_vg = blk.data_b;
259 }
260
261 compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
262
263 bool is_constant_wes { true };
264 float partition0_len_sq { 0.0f };
265
266 vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
267 vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
268
269 for (unsigned int i = 0; i < partition_count; i++)
270 {
271 vfloat4 dir = pms[i].dir;
272 if (hadd_s(dir) < 0.0f)
273 {
274 dir = vfloat4::zero() - dir;
275 }
276
277 line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
278 float lowparam { 1e10f };
279 float highparam { -1e10f };
280
281 unsigned int partition_texel_count = pi.partition_texel_count[i];
282 for (unsigned int j = 0; j < partition_texel_count; j++)
283 {
284 unsigned int tix = pi.texels_of_partition[i][j];
285 vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
286 float param = dot_s(point - line.a, line.b);
287 ei.weights[tix] = param;
288
289 lowparam = astc::min(param, lowparam);
290 highparam = astc::max(param, highparam);
291 }
292
293 // It is possible for a uniform-color partition to produce length=0;
294 // this causes NaN issues so set to small value to avoid this problem
295 if (highparam <= lowparam)
296 {
297 lowparam = 0.0f;
298 highparam = 1e-7f;
299 }
300
301 float length = highparam - lowparam;
302 float length_squared = length * length;
303 float scale = 1.0f / length;
304
305 if (i == 0)
306 {
307 partition0_len_sq = length_squared;
308 }
309 else
310 {
311 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
312 }
313
314 for (unsigned int j = 0; j < partition_texel_count; j++)
315 {
316 unsigned int tix = pi.texels_of_partition[i][j];
317 float idx = (ei.weights[tix] - lowparam) * scale;
318 idx = astc::clamp1f(idx);
319
320 ei.weights[tix] = idx;
321 ei.weight_error_scale[tix] = length_squared * error_weight;
322 assert(!astc::isnan(ei.weight_error_scale[tix]));
323 }
324
325 vfloat4 lowvalue = line.a + line.b * lowparam;
326 vfloat4 highvalue = line.a + line.b * highparam;
327
328 vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
329 vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
330
331 ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
332 ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
333 }
334
335 // Zero initialize any SIMD over-fetch
336 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
337 for (unsigned int i = texel_count; i < texel_count_simd; i++)
338 {
339 ei.weights[i] = 0.0f;
340 ei.weight_error_scale[i] = 0.0f;
341 }
342
343 ei.is_constant_weight_error_scale = is_constant_wes;
344 }
345
346 /**
347 * @brief Compute the ideal endpoints and weights for 3 color components.
348 *
349 * @param blk The image block color data to compress.
350 * @param pi The partition info for the current trial.
351 * @param[out] ei The computed ideal endpoints and weights.
352 * @param omitted_component The color component excluded from the calculation.
353 */
compute_ideal_colors_and_weights_3_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int omitted_component)354 static void compute_ideal_colors_and_weights_3_comp(
355 const image_block& blk,
356 const partition_info& pi,
357 endpoints_and_weights& ei,
358 unsigned int omitted_component
359 ) {
360 unsigned int partition_count = pi.partition_count;
361 ei.ep.partition_count = partition_count;
362 promise(partition_count > 0);
363
364 unsigned int texel_count = blk.texel_count;
365 promise(texel_count > 0);
366
367 partition_metrics pms[BLOCK_MAX_PARTITIONS];
368
369 float error_weight;
370 const float* data_vr = nullptr;
371 const float* data_vg = nullptr;
372 const float* data_vb = nullptr;
373 if (omitted_component == 0)
374 {
375 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
376 data_vr = blk.data_g;
377 data_vg = blk.data_b;
378 data_vb = blk.data_a;
379 }
380 else if (omitted_component == 1)
381 {
382 error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
383 data_vr = blk.data_r;
384 data_vg = blk.data_b;
385 data_vb = blk.data_a;
386 }
387 else if (omitted_component == 2)
388 {
389 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
390 data_vr = blk.data_r;
391 data_vg = blk.data_g;
392 data_vb = blk.data_a;
393 }
394 else
395 {
396 assert(omitted_component == 3);
397
398 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
399 data_vr = blk.data_r;
400 data_vg = blk.data_g;
401 data_vb = blk.data_b;
402 }
403
404 error_weight = error_weight * (1.0f / 3.0f);
405
406 if (omitted_component == 3)
407 {
408 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
409 }
410 else
411 {
412 compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
413 }
414
415 bool is_constant_wes { true };
416 float partition0_len_sq { 0.0f };
417
418 for (unsigned int i = 0; i < partition_count; i++)
419 {
420 vfloat4 dir = pms[i].dir;
421 if (hadd_rgb_s(dir) < 0.0f)
422 {
423 dir = vfloat4::zero() - dir;
424 }
425
426 line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
427 float lowparam { 1e10f };
428 float highparam { -1e10f };
429
430 unsigned int partition_texel_count = pi.partition_texel_count[i];
431 for (unsigned int j = 0; j < partition_texel_count; j++)
432 {
433 unsigned int tix = pi.texels_of_partition[i][j];
434 vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
435 float param = dot3_s(point - line.a, line.b);
436 ei.weights[tix] = param;
437
438 lowparam = astc::min(param, lowparam);
439 highparam = astc::max(param, highparam);
440 }
441
442 // It is possible for a uniform-color partition to produce length=0;
443 // this causes NaN issues so set to small value to avoid this problem
444 if (highparam <= lowparam)
445 {
446 lowparam = 0.0f;
447 highparam = 1e-7f;
448 }
449
450 float length = highparam - lowparam;
451 float length_squared = length * length;
452 float scale = 1.0f / length;
453
454 if (i == 0)
455 {
456 partition0_len_sq = length_squared;
457 }
458 else
459 {
460 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
461 }
462
463 for (unsigned int j = 0; j < partition_texel_count; j++)
464 {
465 unsigned int tix = pi.texels_of_partition[i][j];
466 float idx = (ei.weights[tix] - lowparam) * scale;
467 idx = astc::clamp1f(idx);
468
469 ei.weights[tix] = idx;
470 ei.weight_error_scale[tix] = length_squared * error_weight;
471 assert(!astc::isnan(ei.weight_error_scale[tix]));
472 }
473
474 vfloat4 ep0 = line.a + line.b * lowparam;
475 vfloat4 ep1 = line.a + line.b * highparam;
476
477 vfloat4 bmin = blk.data_min;
478 vfloat4 bmax = blk.data_max;
479
480 assert(omitted_component < BLOCK_MAX_COMPONENTS);
481 switch (omitted_component)
482 {
483 case 0:
484 ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
485 ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
486 break;
487 case 1:
488 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
489 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
490 break;
491 case 2:
492 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
493 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
494 break;
495 default:
496 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
497 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
498 break;
499 }
500 }
501
502 // Zero initialize any SIMD over-fetch
503 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
504 for (unsigned int i = texel_count; i < texel_count_simd; i++)
505 {
506 ei.weights[i] = 0.0f;
507 ei.weight_error_scale[i] = 0.0f;
508 }
509
510 ei.is_constant_weight_error_scale = is_constant_wes;
511 }
512
513 /**
514 * @brief Compute the ideal endpoints and weights for 4 color components.
515 *
516 * @param blk The image block color data to compress.
517 * @param pi The partition info for the current trial.
518 * @param[out] ei The computed ideal endpoints and weights.
519 */
compute_ideal_colors_and_weights_4_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)520 static void compute_ideal_colors_and_weights_4_comp(
521 const image_block& blk,
522 const partition_info& pi,
523 endpoints_and_weights& ei
524 ) {
525 const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
526
527 unsigned int partition_count = pi.partition_count;
528
529 unsigned int texel_count = blk.texel_count;
530 promise(texel_count > 0);
531 promise(partition_count > 0);
532
533 partition_metrics pms[BLOCK_MAX_PARTITIONS];
534
535 compute_avgs_and_dirs_4_comp(pi, blk, pms);
536
537 bool is_constant_wes { true };
538 float partition0_len_sq { 0.0f };
539
540 for (unsigned int i = 0; i < partition_count; i++)
541 {
542 vfloat4 dir = pms[i].dir;
543 if (hadd_rgb_s(dir) < 0.0f)
544 {
545 dir = vfloat4::zero() - dir;
546 }
547
548 line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
549 float lowparam { 1e10f };
550 float highparam { -1e10f };
551
552 unsigned int partition_texel_count = pi.partition_texel_count[i];
553 for (unsigned int j = 0; j < partition_texel_count; j++)
554 {
555 unsigned int tix = pi.texels_of_partition[i][j];
556 vfloat4 point = blk.texel(tix);
557 float param = dot_s(point - line.a, line.b);
558 ei.weights[tix] = param;
559
560 lowparam = astc::min(param, lowparam);
561 highparam = astc::max(param, highparam);
562 }
563
564 // It is possible for a uniform-color partition to produce length=0;
565 // this causes NaN issues so set to small value to avoid this problem
566 if (highparam <= lowparam)
567 {
568 lowparam = 0.0f;
569 highparam = 1e-7f;
570 }
571
572 float length = highparam - lowparam;
573 float length_squared = length * length;
574 float scale = 1.0f / length;
575
576 if (i == 0)
577 {
578 partition0_len_sq = length_squared;
579 }
580 else
581 {
582 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
583 }
584
585 ei.ep.endpt0[i] = line.a + line.b * lowparam;
586 ei.ep.endpt1[i] = line.a + line.b * highparam;
587
588 for (unsigned int j = 0; j < partition_texel_count; j++)
589 {
590 unsigned int tix = pi.texels_of_partition[i][j];
591 float idx = (ei.weights[tix] - lowparam) * scale;
592 idx = astc::clamp1f(idx);
593
594 ei.weights[tix] = idx;
595 ei.weight_error_scale[tix] = length_squared * error_weight;
596 assert(!astc::isnan(ei.weight_error_scale[tix]));
597 }
598 }
599
600 // Zero initialize any SIMD over-fetch
601 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
602 for (unsigned int i = texel_count; i < texel_count_simd; i++)
603 {
604 ei.weights[i] = 0.0f;
605 ei.weight_error_scale[i] = 0.0f;
606 }
607
608 ei.is_constant_weight_error_scale = is_constant_wes;
609 }
610
611 /* See header for documentation. */
compute_ideal_colors_and_weights_1plane(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)612 void compute_ideal_colors_and_weights_1plane(
613 const image_block& blk,
614 const partition_info& pi,
615 endpoints_and_weights& ei
616 ) {
617 bool uses_alpha = !blk.is_constant_channel(3);
618
619 if (uses_alpha)
620 {
621 compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
622 }
623 else
624 {
625 compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
626 }
627 }
628
629 /* See header for documentation. */
compute_ideal_colors_and_weights_2planes(const block_size_descriptor & bsd,const image_block & blk,unsigned int plane2_component,endpoints_and_weights & ei1,endpoints_and_weights & ei2)630 void compute_ideal_colors_and_weights_2planes(
631 const block_size_descriptor& bsd,
632 const image_block& blk,
633 unsigned int plane2_component,
634 endpoints_and_weights& ei1,
635 endpoints_and_weights& ei2
636 ) {
637 const auto& pi = bsd.get_partition_info(1, 0);
638 bool uses_alpha = !blk.is_constant_channel(3);
639
640 assert(plane2_component < BLOCK_MAX_COMPONENTS);
641 switch (plane2_component)
642 {
643 case 0: // Separate weights for red
644 if (uses_alpha)
645 {
646 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
647 }
648 else
649 {
650 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
651 }
652 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
653 break;
654
655 case 1: // Separate weights for green
656 if (uses_alpha)
657 {
658 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
659 }
660 else
661 {
662 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
663 }
664 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
665 break;
666
667 case 2: // Separate weights for blue
668 if (uses_alpha)
669 {
670 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
671 }
672 else
673 {
674 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
675 }
676 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
677 break;
678
679 default: // Separate weights for alpha
680 assert(uses_alpha);
681 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
682 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
683 break;
684 }
685 }
686
687 /* See header for documentation. */
compute_error_of_weight_set_1plane(const endpoints_and_weights & eai,const decimation_info & di,const float * dec_weight_quant_uvalue)688 float compute_error_of_weight_set_1plane(
689 const endpoints_and_weights& eai,
690 const decimation_info& di,
691 const float* dec_weight_quant_uvalue
692 ) {
693 vfloatacc error_summav = vfloatacc::zero();
694 float error_summa = 0.0f;
695 unsigned int texel_count = di.texel_count;
696
697 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
698 if (di.max_texel_weight_count > 2)
699 {
700 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
701 {
702 // Compute the bilinear interpolation of the decimated weight grid
703 vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
704
705 // Compute the error between the computed value and the ideal weight
706 vfloat actual_values = loada(eai.weights + i);
707 vfloat diff = current_values - actual_values;
708 vfloat significance = loada(eai.weight_error_scale + i);
709 vfloat error = diff * diff * significance;
710
711 haccumulate(error_summav, error);
712 }
713 }
714 else if (di.max_texel_weight_count > 1)
715 {
716 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
717 {
718 // Compute the bilinear interpolation of the decimated weight grid
719 vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
720
721 // Compute the error between the computed value and the ideal weight
722 vfloat actual_values = loada(eai.weights + i);
723 vfloat diff = current_values - actual_values;
724 vfloat significance = loada(eai.weight_error_scale + i);
725 vfloat error = diff * diff * significance;
726
727 haccumulate(error_summav, error);
728 }
729 }
730 else
731 {
732 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
733 {
734 // Load the weight set directly, without interpolation
735 vfloat current_values = loada(dec_weight_quant_uvalue + i);
736
737 // Compute the error between the computed value and the ideal weight
738 vfloat actual_values = loada(eai.weights + i);
739 vfloat diff = current_values - actual_values;
740 vfloat significance = loada(eai.weight_error_scale + i);
741 vfloat error = diff * diff * significance;
742
743 haccumulate(error_summav, error);
744 }
745 }
746
747 // Resolve the final scalar accumulator sum
748 return error_summa = hadd_s(error_summav);
749 }
750
751 /* See header for documentation. */
compute_error_of_weight_set_2planes(const endpoints_and_weights & eai1,const endpoints_and_weights & eai2,const decimation_info & di,const float * dec_weight_quant_uvalue_plane1,const float * dec_weight_quant_uvalue_plane2)752 float compute_error_of_weight_set_2planes(
753 const endpoints_and_weights& eai1,
754 const endpoints_and_weights& eai2,
755 const decimation_info& di,
756 const float* dec_weight_quant_uvalue_plane1,
757 const float* dec_weight_quant_uvalue_plane2
758 ) {
759 vfloatacc error_summav = vfloatacc::zero();
760 unsigned int texel_count = di.texel_count;
761
762 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
763 if (di.max_texel_weight_count > 2)
764 {
765 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
766 {
767 // Plane 1
768 // Compute the bilinear interpolation of the decimated weight grid
769 vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
770
771 // Compute the error between the computed value and the ideal weight
772 vfloat actual_values1 = loada(eai1.weights + i);
773 vfloat diff = current_values1 - actual_values1;
774 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
775
776 // Plane 2
777 // Compute the bilinear interpolation of the decimated weight grid
778 vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
779
780 // Compute the error between the computed value and the ideal weight
781 vfloat actual_values2 = loada(eai2.weights + i);
782 diff = current_values2 - actual_values2;
783 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
784
785 haccumulate(error_summav, error1 + error2);
786 }
787 }
788 else if (di.max_texel_weight_count > 1)
789 {
790 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
791 {
792 // Plane 1
793 // Compute the bilinear interpolation of the decimated weight grid
794 vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
795
796 // Compute the error between the computed value and the ideal weight
797 vfloat actual_values1 = loada(eai1.weights + i);
798 vfloat diff = current_values1 - actual_values1;
799 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
800
801 // Plane 2
802 // Compute the bilinear interpolation of the decimated weight grid
803 vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
804
805 // Compute the error between the computed value and the ideal weight
806 vfloat actual_values2 = loada(eai2.weights + i);
807 diff = current_values2 - actual_values2;
808 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
809
810 haccumulate(error_summav, error1 + error2);
811 }
812 }
813 else
814 {
815 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
816 {
817 // Plane 1
818 // Load the weight set directly, without interpolation
819 vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
820
821 // Compute the error between the computed value and the ideal weight
822 vfloat actual_values1 = loada(eai1.weights + i);
823 vfloat diff = current_values1 - actual_values1;
824 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
825
826 // Plane 2
827 // Load the weight set directly, without interpolation
828 vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
829
830 // Compute the error between the computed value and the ideal weight
831 vfloat actual_values2 = loada(eai2.weights + i);
832 diff = current_values2 - actual_values2;
833 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
834
835 haccumulate(error_summav, error1 + error2);
836 }
837 }
838
839 // Resolve the final scalar accumulator sum
840 return hadd_s(error_summav);
841 }
842
843 /* See header for documentation. */
compute_ideal_weights_for_decimation(const endpoints_and_weights & ei,const decimation_info & di,float * dec_weight_ideal_value)844 void compute_ideal_weights_for_decimation(
845 const endpoints_and_weights& ei,
846 const decimation_info& di,
847 float* dec_weight_ideal_value
848 ) {
849 unsigned int texel_count = di.texel_count;
850 unsigned int weight_count = di.weight_count;
851 bool is_direct = texel_count == weight_count;
852 promise(texel_count > 0);
853 promise(weight_count > 0);
854
855 // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
856 // can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
857 // arrays always contain space for 64 elements
858 unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
859 storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
860
861 // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
862 // zero-initialized SIMD over-fetch region
863 if (is_direct)
864 {
865 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
866 for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH)
867 {
868 vfloat weight(ei.weights + i);
869 storea(weight, dec_weight_ideal_value + i);
870 }
871
872 return;
873 }
874
875 // Otherwise compute an estimate and perform single refinement iteration
876 alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
877
878 // Compute an initial average for each decimated weight
879 bool constant_wes = ei.is_constant_weight_error_scale;
880 vfloat weight_error_scale(ei.weight_error_scale[0]);
881
882 // This overshoots - this is OK as we initialize the array tails in the
883 // decimation table structures to safe values ...
884 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
885 {
886 // Start with a small value to avoid div-by-zero later
887 vfloat weight_weight(1e-10f);
888 vfloat initial_weight = vfloat::zero();
889
890 // Accumulate error weighting of all the texels using this weight
891 vint weight_texel_count(di.weight_texel_count + i);
892 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
893 promise(max_texel_count > 0);
894
895 for (unsigned int j = 0; j < max_texel_count; j++)
896 {
897 vint texel(di.weight_texel[j] + i);
898 vfloat weight = loada(di.weights_flt[j] + i);
899
900 if (!constant_wes)
901 {
902 weight_error_scale = gatherf(ei.weight_error_scale, texel);
903 }
904
905 vfloat contrib_weight = weight * weight_error_scale;
906
907 weight_weight += contrib_weight;
908 initial_weight += gatherf(ei.weights, texel) * contrib_weight;
909 }
910
911 storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
912 }
913
914 // Populate the interpolated weight grid based on the initial average
915 // Process SIMD-width texel coordinates at at time while we can. Safe to
916 // over-process full SIMD vectors - the tail is zeroed.
917 if (di.max_texel_weight_count <= 2)
918 {
919 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
920 {
921 vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
922 storea(weight, infilled_weights + i);
923 }
924 }
925 else
926 {
927 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
928 {
929 vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
930 storea(weight, infilled_weights + i);
931 }
932 }
933
934 // Perform a single iteration of refinement
935 // Empirically determined step size; larger values don't help but smaller drops image quality
936 constexpr float stepsize = 0.25f;
937 constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
938
939 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
940 {
941 vfloat weight_val = loada(dec_weight_ideal_value + i);
942
943 // Accumulate error weighting of all the texels using this weight
944 // Start with a small value to avoid div-by-zero later
945 vfloat error_change0(1e-10f);
946 vfloat error_change1(0.0f);
947
948 // Accumulate error weighting of all the texels using this weight
949 vint weight_texel_count(di.weight_texel_count + i);
950 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
951 promise(max_texel_count > 0);
952
953 for (unsigned int j = 0; j < max_texel_count; j++)
954 {
955 vint texel(di.weight_texel[j] + i);
956 vfloat contrib_weight = loada(di.weights_flt[j] + i);
957
958 if (!constant_wes)
959 {
960 weight_error_scale = gatherf(ei.weight_error_scale, texel);
961 }
962
963 vfloat scale = weight_error_scale * contrib_weight;
964 vfloat old_weight = gatherf(infilled_weights, texel);
965 vfloat ideal_weight = gatherf(ei.weights, texel);
966
967 error_change0 += contrib_weight * scale;
968 error_change1 += (old_weight - ideal_weight) * scale;
969 }
970
971 vfloat step = (error_change1 * chd_scale) / error_change0;
972 step = clamp(-stepsize, stepsize, step);
973
974 // Update the weight; note this can store negative values.
975 storea(weight_val + step, dec_weight_ideal_value + i);
976 }
977 }
978
979 /* See header for documentation. */
compute_quantized_weights_for_decimation(const decimation_info & di,float low_bound,float high_bound,const float * dec_weight_ideal_value,float * weight_set_out,uint8_t * quantized_weight_set,quant_method quant_level)980 void compute_quantized_weights_for_decimation(
981 const decimation_info& di,
982 float low_bound,
983 float high_bound,
984 const float* dec_weight_ideal_value,
985 float* weight_set_out,
986 uint8_t* quantized_weight_set,
987 quant_method quant_level
988 ) {
989 int weight_count = di.weight_count;
990 promise(weight_count > 0);
991 const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
992
993 // The available quant levels, stored with a minus 1 bias
994 static const float quant_levels_m1[12] {
995 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
996 };
997
998 vint steps_m1(get_quant_level(quant_level) - 1);
999 float quant_level_m1 = quant_levels_m1[quant_level];
1000
1001 // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
1002
1003 // TODO: Oddity to investigate; triggered by test in issue #265.
1004 if (high_bound <= low_bound)
1005 {
1006 low_bound = 0.0f;
1007 high_bound = 1.0f;
1008 }
1009
1010 float rscale = high_bound - low_bound;
1011 float scale = 1.0f / rscale;
1012
1013 float scaled_low_bound = low_bound * scale;
1014 rscale *= 1.0f / 64.0f;
1015
1016 vfloat scalev(scale);
1017 vfloat scaled_low_boundv(scaled_low_bound);
1018 vfloat quant_level_m1v(quant_level_m1);
1019 vfloat rscalev(rscale);
1020 vfloat low_boundv(low_bound);
1021
1022 // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
1023 // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
1024 if (get_quant_level(quant_level) <= 16)
1025 {
1026 vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
1027 vint tab0p;
1028 vtable_prepare(tab0, tab0p);
1029
1030 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1031 {
1032 vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1033 ix = clampzo(ix);
1034
1035 // Look up the two closest indexes and return the one that was closest
1036 vfloat ix1 = ix * quant_level_m1v;
1037
1038 vint weightl = float_to_int(ix1);
1039 vint weighth = min(weightl + vint(1), steps_m1);
1040
1041 vint ixli = vtable_8bt_32bi(tab0p, weightl);
1042 vint ixhi = vtable_8bt_32bi(tab0p, weighth);
1043
1044 vfloat ixl = int_to_float(ixli);
1045 vfloat ixh = int_to_float(ixhi);
1046
1047 vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
1048 vint weight = select(ixli, ixhi, mask);
1049 ixl = select(ixl, ixh, mask);
1050
1051 // Invert the weight-scaling that was done initially
1052 storea(ixl * rscalev + low_boundv, weight_set_out + i);
1053 vint scn = pack_low_bytes(weight);
1054 store_nbytes(scn, quantized_weight_set + i);
1055 }
1056 }
1057 else
1058 {
1059 vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
1060 vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + 16));
1061 vint tab0p, tab1p;
1062 vtable_prepare(tab0, tab1, tab0p, tab1p);
1063
1064 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1065 {
1066 vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1067 ix = clampzo(ix);
1068
1069 // Look up the two closest indexes and return the one that was closest
1070 vfloat ix1 = ix * quant_level_m1v;
1071
1072 vint weightl = float_to_int(ix1);
1073 vint weighth = min(weightl + vint(1), steps_m1);
1074
1075 vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
1076 vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
1077
1078 vfloat ixl = int_to_float(ixli);
1079 vfloat ixh = int_to_float(ixhi);
1080
1081 vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
1082 vint weight = select(ixli, ixhi, mask);
1083 ixl = select(ixl, ixh, mask);
1084
1085 // Invert the weight-scaling that was done initially
1086 storea(ixl * rscalev + low_boundv, weight_set_out + i);
1087 vint scn = pack_low_bytes(weight);
1088 store_nbytes(scn, quantized_weight_set + i);
1089 }
1090 }
1091 }
1092
1093 /**
1094 * @brief Compute the RGB + offset for a HDR endpoint mode #7.
1095 *
1096 * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
1097 * gives us ~24 multiplications vs. 96 for a generic inverse.
1098 *
1099 * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x);
1100 * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y);
1101 * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z);
1102 * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum);
1103 * mat = invert(mat);
1104 *
1105 * @param rgba_weight_sum Sum of partition component error weights.
1106 * @param weight_weight_sum Sum of partition component error weights * texel weight.
1107 * @param rgbq_sum Sum of partition component error weights * texel weight * color data.
1108 * @param psum Sum of RGB color weights * texel weight^2.
1109 */
compute_rgbo_vector(vfloat4 rgba_weight_sum,vfloat4 weight_weight_sum,vfloat4 rgbq_sum,float psum)1110 static inline vfloat4 compute_rgbo_vector(
1111 vfloat4 rgba_weight_sum,
1112 vfloat4 weight_weight_sum,
1113 vfloat4 rgbq_sum,
1114 float psum
1115 ) {
1116 float X = rgba_weight_sum.lane<0>();
1117 float Y = rgba_weight_sum.lane<1>();
1118 float Z = rgba_weight_sum.lane<2>();
1119 float P = weight_weight_sum.lane<0>();
1120 float Q = weight_weight_sum.lane<1>();
1121 float R = weight_weight_sum.lane<2>();
1122 float S = psum;
1123
1124 float PP = P * P;
1125 float QQ = Q * Q;
1126 float RR = R * R;
1127
1128 float SZmRR = S * Z - RR;
1129 float DT = SZmRR * Y - Z * QQ;
1130 float YP = Y * P;
1131 float QX = Q * X;
1132 float YX = Y * X;
1133 float mZYP = -Z * YP;
1134 float mZQX = -Z * QX;
1135 float mRYX = -R * YX;
1136 float ZQP = Z * Q * P;
1137 float RYP = R * YP;
1138 float RQX = R * QX;
1139
1140 // Compute the reciprocal of matrix determinant
1141 float rdet = 1.0f / (DT * X + mZYP * P);
1142
1143 // Actually compute the adjugate, and then apply 1/det separately
1144 vfloat4 mat0(DT, ZQP, RYP, mZYP);
1145 vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
1146 vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
1147 vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
1148 vfloat4 vect = rgbq_sum * rdet;
1149
1150 return vfloat4(dot_s(mat0, vect),
1151 dot_s(mat1, vect),
1152 dot_s(mat2, vect),
1153 dot_s(mat3, vect));
1154 }
1155
1156 /* See header for documentation. */
recompute_ideal_colors_1plane(const image_block & blk,const partition_info & pi,const decimation_info & di,const uint8_t * dec_weights_uquant,endpoints & ep,vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS])1157 void recompute_ideal_colors_1plane(
1158 const image_block& blk,
1159 const partition_info& pi,
1160 const decimation_info& di,
1161 const uint8_t* dec_weights_uquant,
1162 endpoints& ep,
1163 vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
1164 vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
1165 ) {
1166 unsigned int weight_count = di.weight_count;
1167 unsigned int total_texel_count = blk.texel_count;
1168 unsigned int partition_count = pi.partition_count;
1169
1170 promise(weight_count > 0);
1171 promise(total_texel_count > 0);
1172 promise(partition_count > 0);
1173
1174 alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
1175 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1176 {
1177 vint unquant_value(dec_weights_uquant + i);
1178 vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
1179 storea(unquant_valuef, dec_weight + i);
1180 }
1181
1182 alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
1183 float* undec_weight_ref;
1184 if (di.max_texel_weight_count == 1)
1185 {
1186 undec_weight_ref = dec_weight;
1187 }
1188 else if (di.max_texel_weight_count <= 2)
1189 {
1190 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1191 {
1192 vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
1193 storea(weight, undec_weight + i);
1194 }
1195
1196 undec_weight_ref = undec_weight;
1197 }
1198 else
1199 {
1200 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1201 {
1202 vfloat weight = bilinear_infill_vla(di, dec_weight, i);
1203 storea(weight, undec_weight + i);
1204 }
1205
1206 undec_weight_ref = undec_weight;
1207 }
1208
1209 vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
1210
1211 for (unsigned int i = 0; i < partition_count; i++)
1212 {
1213 unsigned int texel_count = pi.partition_texel_count[i];
1214 const uint8_t *texel_indexes = pi.texels_of_partition[i];
1215
1216 // Only compute a partition mean if more than one partition
1217 if (partition_count > 1)
1218 {
1219 rgba_sum = vfloat4(1e-17f);
1220 promise(texel_count > 0);
1221 for (unsigned int j = 0; j < texel_count; j++)
1222 {
1223 unsigned int tix = texel_indexes[j];
1224 rgba_sum += blk.texel(tix);
1225 }
1226 }
1227
1228 rgba_sum = rgba_sum * blk.channel_weight;
1229 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1230 vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
1231
1232 float scale_max = 0.0f;
1233 float scale_min = 1e10f;
1234
1235 float wmin1 = 1.0f;
1236 float wmax1 = 0.0f;
1237
1238 float left_sum_s = 0.0f;
1239 float middle_sum_s = 0.0f;
1240 float right_sum_s = 0.0f;
1241
1242 vfloat4 color_vec_x = vfloat4::zero();
1243 vfloat4 color_vec_y = vfloat4::zero();
1244
1245 vfloat4 scale_vec = vfloat4::zero();
1246
1247 float weight_weight_sum_s = 1e-17f;
1248
1249 vfloat4 color_weight = blk.channel_weight;
1250 float ls_weight = hadd_rgb_s(color_weight);
1251
1252 for (unsigned int j = 0; j < texel_count; j++)
1253 {
1254 unsigned int tix = texel_indexes[j];
1255
1256 vfloat4 rgba = blk.texel(tix);
1257
1258 float idx0 = undec_weight_ref[tix];
1259
1260 float om_idx0 = 1.0f - idx0;
1261 wmin1 = astc::min(idx0, wmin1);
1262 wmax1 = astc::max(idx0, wmax1);
1263
1264 float scale = dot3_s(scale_dir, rgba);
1265 scale_min = astc::min(scale, scale_min);
1266 scale_max = astc::max(scale, scale_max);
1267
1268 left_sum_s += om_idx0 * om_idx0;
1269 middle_sum_s += om_idx0 * idx0;
1270 right_sum_s += idx0 * idx0;
1271 weight_weight_sum_s += idx0;
1272
1273 vfloat4 color_idx(idx0);
1274 vfloat4 cwprod = rgba;
1275 vfloat4 cwiprod = cwprod * color_idx;
1276
1277 color_vec_y += cwiprod;
1278 color_vec_x += cwprod - cwiprod;
1279
1280 scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
1281 }
1282
1283 vfloat4 left_sum = vfloat4(left_sum_s) * color_weight;
1284 vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
1285 vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
1286 vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
1287
1288 vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
1289 float psum = right_sum_s * hadd_rgb_s(color_weight);
1290
1291 color_vec_x = color_vec_x * color_weight;
1292 color_vec_y = color_vec_y * color_weight;
1293
1294 // Initialize the luminance and scale vectors with a reasonable default
1295 float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1296 scalediv = astc::clamp1f(scalediv);
1297
1298 vfloat4 sds = scale_dir * scale_max;
1299
1300 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1301
1302 if (wmin1 >= wmax1 * 0.999f)
1303 {
1304 // If all weights in the partition were equal, then just take average of all colors in
1305 // the partition and use that as both endpoint colors
1306 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1307
1308 vmask4 notnan_mask = avg == avg;
1309 ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
1310 ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
1311
1312 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1313 }
1314 else
1315 {
1316 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1317 // set of texel weights and pixel colors
1318 vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
1319 vfloat4 color_rdet1 = 1.0f / color_det1;
1320
1321 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1322 float ls_rdet1 = 1.0f / ls_det1;
1323
1324 vfloat4 color_mss1 = (left_sum * left_sum)
1325 + (2.0f * middle_sum * middle_sum)
1326 + (right_sum * right_sum);
1327
1328 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1329 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1330 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1331
1332 vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
1333 vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
1334
1335 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1336 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1337 vmask4 full_mask = det_mask & notnan_mask;
1338
1339 ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
1340 ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
1341
1342 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1343 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1344
1345 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1346 {
1347 float scalediv2 = scale_ep0 / scale_ep1;
1348 vfloat4 sdsm = scale_dir * scale_ep1;
1349 rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1350 }
1351 }
1352
1353 // Calculations specific to mode #7, the HDR RGB-scale mode
1354 vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1355 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1356
1357 vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1358 rgbo_vectors[i] = rgbovec;
1359
1360 // We can get a failure due to the use of a singular (non-invertible) matrix
1361 // If it failed, compute rgbo_vectors[] with a different method ...
1362 if (astc::isnan(dot_s(rgbovec, rgbovec)))
1363 {
1364 vfloat4 v0 = ep.endpt0[i];
1365 vfloat4 v1 = ep.endpt1[i];
1366
1367 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1368 avgdif = astc::max(avgdif, 0.0f);
1369
1370 vfloat4 avg = (v0 + v1) * 0.5f;
1371 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1372 rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1373 }
1374 }
1375 }
1376
1377 /* See header for documentation. */
recompute_ideal_colors_2planes(const image_block & blk,const block_size_descriptor & bsd,const decimation_info & di,const uint8_t * dec_weights_uquant_plane1,const uint8_t * dec_weights_uquant_plane2,endpoints & ep,vfloat4 & rgbs_vector,vfloat4 & rgbo_vector,int plane2_component)1378 void recompute_ideal_colors_2planes(
1379 const image_block& blk,
1380 const block_size_descriptor& bsd,
1381 const decimation_info& di,
1382 const uint8_t* dec_weights_uquant_plane1,
1383 const uint8_t* dec_weights_uquant_plane2,
1384 endpoints& ep,
1385 vfloat4& rgbs_vector,
1386 vfloat4& rgbo_vector,
1387 int plane2_component
1388 ) {
1389 unsigned int weight_count = di.weight_count;
1390 unsigned int total_texel_count = blk.texel_count;
1391
1392 promise(total_texel_count > 0);
1393 promise(weight_count > 0);
1394
1395 alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
1396 alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
1397
1398 assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
1399
1400 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1401 {
1402 vint unquant_value1(dec_weights_uquant_plane1 + i);
1403 vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
1404 storea(unquant_value1f, dec_weight_plane1 + i);
1405
1406 vint unquant_value2(dec_weights_uquant_plane2 + i);
1407 vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
1408 storea(unquant_value2f, dec_weight_plane2 + i);
1409 }
1410
1411 alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
1412 alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
1413
1414 float* undec_weight_plane1_ref;
1415 float* undec_weight_plane2_ref;
1416
1417 if (di.max_texel_weight_count == 1)
1418 {
1419 undec_weight_plane1_ref = dec_weight_plane1;
1420 undec_weight_plane2_ref = dec_weight_plane2;
1421 }
1422 else if (di.max_texel_weight_count <= 2)
1423 {
1424 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1425 {
1426 vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
1427 storea(weight, undec_weight_plane1 + i);
1428
1429 weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
1430 storea(weight, undec_weight_plane2 + i);
1431 }
1432
1433 undec_weight_plane1_ref = undec_weight_plane1;
1434 undec_weight_plane2_ref = undec_weight_plane2;
1435 }
1436 else
1437 {
1438 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1439 {
1440 vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
1441 storea(weight, undec_weight_plane1 + i);
1442
1443 weight = bilinear_infill_vla(di, dec_weight_plane2, i);
1444 storea(weight, undec_weight_plane2 + i);
1445 }
1446
1447 undec_weight_plane1_ref = undec_weight_plane1;
1448 undec_weight_plane2_ref = undec_weight_plane2;
1449 }
1450
1451 unsigned int texel_count = bsd.texel_count;
1452 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1453 vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
1454
1455 float scale_max = 0.0f;
1456 float scale_min = 1e10f;
1457
1458 float wmin1 = 1.0f;
1459 float wmax1 = 0.0f;
1460
1461 float wmin2 = 1.0f;
1462 float wmax2 = 0.0f;
1463
1464 float left1_sum_s = 0.0f;
1465 float middle1_sum_s = 0.0f;
1466 float right1_sum_s = 0.0f;
1467
1468 float left2_sum_s = 0.0f;
1469 float middle2_sum_s = 0.0f;
1470 float right2_sum_s = 0.0f;
1471
1472 vfloat4 color_vec_x = vfloat4::zero();
1473 vfloat4 color_vec_y = vfloat4::zero();
1474
1475 vfloat4 scale_vec = vfloat4::zero();
1476
1477 vfloat4 weight_weight_sum = vfloat4(1e-17f);
1478
1479 vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
1480 vfloat4 color_weight = blk.channel_weight;
1481 float ls_weight = hadd_rgb_s(color_weight);
1482
1483 for (unsigned int j = 0; j < texel_count; j++)
1484 {
1485 vfloat4 rgba = blk.texel(j);
1486
1487 float idx0 = undec_weight_plane1_ref[j];
1488
1489 float om_idx0 = 1.0f - idx0;
1490 wmin1 = astc::min(idx0, wmin1);
1491 wmax1 = astc::max(idx0, wmax1);
1492
1493 float scale = dot3_s(scale_dir, rgba);
1494 scale_min = astc::min(scale, scale_min);
1495 scale_max = astc::max(scale, scale_max);
1496
1497 left1_sum_s += om_idx0 * om_idx0;
1498 middle1_sum_s += om_idx0 * idx0;
1499 right1_sum_s += idx0 * idx0;
1500
1501 float idx1 = undec_weight_plane2_ref[j];
1502
1503 float om_idx1 = 1.0f - idx1;
1504 wmin2 = astc::min(idx1, wmin2);
1505 wmax2 = astc::max(idx1, wmax2);
1506
1507 left2_sum_s += om_idx1 * om_idx1;
1508 middle2_sum_s += om_idx1 * idx1;
1509 right2_sum_s += idx1 * idx1;
1510
1511 vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
1512
1513 vfloat4 cwprod = rgba;
1514 vfloat4 cwiprod = cwprod * color_idx;
1515
1516 color_vec_y += cwiprod;
1517 color_vec_x += cwprod - cwiprod;
1518
1519 scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
1520 weight_weight_sum += (color_weight * color_idx);
1521 }
1522
1523 vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
1524 vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
1525 vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight;
1526 vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
1527
1528 vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight;
1529 vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
1530 vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
1531
1532 float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
1533
1534 color_vec_x = color_vec_x * color_weight;
1535 color_vec_y = color_vec_y * color_weight;
1536
1537 // Initialize the luminance and scale vectors with a reasonable default
1538 float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1539 scalediv = astc::clamp1f(scalediv);
1540
1541 vfloat4 sds = scale_dir * scale_max;
1542
1543 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1544
1545 if (wmin1 >= wmax1 * 0.999f)
1546 {
1547 // If all weights in the partition were equal, then just take average of all colors in
1548 // the partition and use that as both endpoint colors
1549 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1550
1551 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1552 vmask4 notnan_mask = avg == avg;
1553 vmask4 full_mask = p1_mask & notnan_mask;
1554
1555 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1556 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1557
1558 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1559 }
1560 else
1561 {
1562 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1563 // set of texel weights and pixel colors
1564 vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
1565 vfloat4 color_rdet1 = 1.0f / color_det1;
1566
1567 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1568 float ls_rdet1 = 1.0f / ls_det1;
1569
1570 vfloat4 color_mss1 = (left1_sum * left1_sum)
1571 + (2.0f * middle1_sum * middle1_sum)
1572 + (right1_sum * right1_sum);
1573
1574 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1575 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1576 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1577
1578 vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
1579 vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
1580
1581 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1582 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1583
1584 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1585 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1586 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1587 vmask4 full_mask = p1_mask & det_mask & notnan_mask;
1588
1589 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1590 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1591
1592 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1593 {
1594 float scalediv2 = scale_ep0 / scale_ep1;
1595 vfloat4 sdsm = scale_dir * scale_ep1;
1596 rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1597 }
1598 }
1599
1600 if (wmin2 >= wmax2 * 0.999f)
1601 {
1602 // If all weights in the partition were equal, then just take average of all colors in
1603 // the partition and use that as both endpoint colors
1604 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1605
1606 vmask4 notnan_mask = avg == avg;
1607 vmask4 full_mask = p2_mask & notnan_mask;
1608
1609 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1610 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1611 }
1612 else
1613 {
1614 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1615 // set of texel weights and pixel colors
1616 vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
1617 vfloat4 color_rdet2 = 1.0f / color_det2;
1618
1619 vfloat4 color_mss2 = (left2_sum * left2_sum)
1620 + (2.0f * middle2_sum * middle2_sum)
1621 + (right2_sum * right2_sum);
1622
1623 vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
1624 vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
1625
1626 vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
1627 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1628 vmask4 full_mask = p2_mask & det_mask & notnan_mask;
1629
1630 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1631 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1632 }
1633
1634 // Calculations specific to mode #7, the HDR RGB-scale mode
1635 vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1636 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1637
1638 rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1639
1640 // We can get a failure due to the use of a singular (non-invertible) matrix
1641 // If it failed, compute rgbo_vectors[] with a different method ...
1642 if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
1643 {
1644 vfloat4 v0 = ep.endpt0[0];
1645 vfloat4 v1 = ep.endpt1[0];
1646
1647 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1648 avgdif = astc::max(avgdif, 0.0f);
1649
1650 vfloat4 avg = (v0 + v1) * 0.5f;
1651 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1652
1653 rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1654 }
1655 }
1656
1657 #endif
1658