1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions to decompress a symbolic block.
20  */
21 
22 #include "astcenc_internal.h"
23 
24 #include <stdio.h>
25 #include <assert.h>
26 
27 /**
28  * @brief Compute the integer linear interpolation of two color endpoints.
29  *
30  * @param decode_mode   The ASTC profile (linear or sRGB)
31  * @param color0        The endpoint0 color.
32  * @param color1        The endpoint1 color.
33  * @param weights        The interpolation weight (between 0 and 64).
34  *
35  * @return The interpolated color.
36  */
lerp_color_int(astcenc_profile decode_mode,vint4 color0,vint4 color1,vint4 weights)37 static vint4 lerp_color_int(
38 	astcenc_profile decode_mode,
39 	vint4 color0,
40 	vint4 color1,
41 	vint4 weights
42 ) {
43 	vint4 weight1 = weights;
44 	vint4 weight0 = vint4(64) - weight1;
45 
46 	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
47 	{
48 		color0 = asr<8>(color0);
49 		color1 = asr<8>(color1);
50 	}
51 
52 	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
53 	color = asr<6>(color);
54 
55 	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
56 	{
57 		color = color * vint4(257);
58 	}
59 
60 	return color;
61 }
62 
63 
64 /**
65  * @brief Convert integer color value into a float value for the decoder.
66  *
67  * @param data       The integer color value post-interpolation.
68  * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
69  *
70  * @return The float color value.
71  */
decode_texel(vint4 data,vmask4 lns_mask)72 static inline vfloat4 decode_texel(
73 	vint4 data,
74 	vmask4 lns_mask
75 ) {
76 	vint4 color_lns = vint4::zero();
77 	vint4 color_unorm = vint4::zero();
78 
79 	if (any(lns_mask))
80 	{
81 		color_lns = lns_to_sf16(data);
82 	}
83 
84 	if (!all(lns_mask))
85 	{
86 		color_unorm = unorm16_to_sf16(data);
87 	}
88 
89 	// Pick components and then convert to FP16
90 	vint4 datai = select(color_unorm, color_lns, lns_mask);
91 	return float16_to_float(datai);
92 }
93 
94 /* See header for documentation. */
unpack_weights(const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const decimation_info & di,bool is_dual_plane,int weights_plane1[BLOCK_MAX_TEXELS],int weights_plane2[BLOCK_MAX_TEXELS])95 void unpack_weights(
96 	const block_size_descriptor& bsd,
97 	const symbolic_compressed_block& scb,
98 	const decimation_info& di,
99 	bool is_dual_plane,
100 	int weights_plane1[BLOCK_MAX_TEXELS],
101 	int weights_plane2[BLOCK_MAX_TEXELS]
102 ) {
103 	// Safe to overshoot as all arrays are allocated to full size
104 	if (!is_dual_plane)
105 	{
106 		// Build full 64-entry weight lookup table
107 		vint4 tab0(reinterpret_cast<const int*>(scb.weights +  0));
108 		vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));
109 		vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));
110 		vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));
111 
112 		vint tab0p, tab1p, tab2p, tab3p;
113 		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
114 
115 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
116 		{
117 			vint summed_value(8);
118 			vint weight_count(di.texel_weight_count + i);
119 			int max_weight_count = hmax(weight_count).lane<0>();
120 
121 			promise(max_weight_count > 0);
122 			for (int j = 0; j < max_weight_count; j++)
123 			{
124 				vint texel_weights(di.texel_weights_4t[j] + i);
125 				vint texel_weights_int(di.texel_weights_int_4t[j] + i);
126 
127 				summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
128 			}
129 
130 			store(lsr<4>(summed_value), weights_plane1 + i);
131 		}
132 	}
133 	else
134 	{
135 		// Build a 32-entry weight lookup table per plane
136 		// Plane 1
137 		vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights +  0));
138 		vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));
139 		vint tab0_plane1p, tab1_plane1p;
140 		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
141 
142 		// Plane 2
143 		vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));
144 		vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));
145 		vint tab0_plane2p, tab1_plane2p;
146 		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
147 
148 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
149 		{
150 			vint sum_plane1(8);
151 			vint sum_plane2(8);
152 
153 			vint weight_count(di.texel_weight_count + i);
154 			int max_weight_count = hmax(weight_count).lane<0>();
155 
156 			promise(max_weight_count > 0);
157 			for (int j = 0; j < max_weight_count; j++)
158 			{
159 				vint texel_weights(di.texel_weights_4t[j] + i);
160 				vint texel_weights_int(di.texel_weights_int_4t[j] + i);
161 
162 				sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
163 				sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
164 			}
165 
166 			store(lsr<4>(sum_plane1), weights_plane1 + i);
167 			store(lsr<4>(sum_plane2), weights_plane2 + i);
168 		}
169 	}
170 }
171 
172 /**
173  * @brief Return an FP32 NaN value for use in error colors.
174  *
175  * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
176  *
177  * @return The float color value.
178  */
error_color_nan()179 static float error_color_nan()
180 {
181 	if32 v;
182 	v.u = 0xFFFFE000U;
183 	return v.f;
184 }
185 
186 /* See header for documentation. */
decompress_symbolic_block(astcenc_profile decode_mode,const block_size_descriptor & bsd,int xpos,int ypos,int zpos,const symbolic_compressed_block & scb,image_block & blk)187 void decompress_symbolic_block(
188 	astcenc_profile decode_mode,
189 	const block_size_descriptor& bsd,
190 	int xpos,
191 	int ypos,
192 	int zpos,
193 	const symbolic_compressed_block& scb,
194 	image_block& blk
195 ) {
196 	blk.xpos = xpos;
197 	blk.ypos = ypos;
198 	blk.zpos = zpos;
199 
200 	blk.data_min = vfloat4::zero();
201 	blk.data_mean = vfloat4::zero();
202 	blk.data_max = vfloat4::zero();
203 	blk.grayscale = false;
204 
205 	// If we detected an error-block, blow up immediately.
206 	if (scb.block_type == SYM_BTYPE_ERROR)
207 	{
208 		for (unsigned int i = 0; i < bsd.texel_count; i++)
209 		{
210 			blk.data_r[i] = error_color_nan();
211 			blk.data_g[i] = error_color_nan();
212 			blk.data_b[i] = error_color_nan();
213 			blk.data_a[i] = error_color_nan();
214 			blk.rgb_lns[i] = 0;
215 			blk.alpha_lns[i] = 0;
216 		}
217 
218 		return;
219 	}
220 
221 	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
222 	    (scb.block_type == SYM_BTYPE_CONST_U16))
223 	{
224 		vfloat4 color;
225 		uint8_t use_lns = 0;
226 
227 		// UNORM16 constant color block
228 		if (scb.block_type == SYM_BTYPE_CONST_U16)
229 		{
230 			vint4 colori(scb.constant_color);
231 
232 			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
233 			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
234 			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
235 			{
236 				colori = asr<8>(colori) * 257;
237 			}
238 
239 			vint4 colorf16 = unorm16_to_sf16(colori);
240 			color = float16_to_float(colorf16);
241 		}
242 		// FLOAT16 constant color block
243 		else
244 		{
245 			switch (decode_mode)
246 			{
247 			case ASTCENC_PRF_LDR_SRGB:
248 			case ASTCENC_PRF_LDR:
249 				color = vfloat4(error_color_nan());
250 				break;
251 			case ASTCENC_PRF_HDR_RGB_LDR_A:
252 			case ASTCENC_PRF_HDR:
253 				// Constant-color block; unpack from FP16 to FP32.
254 				color = float16_to_float(vint4(scb.constant_color));
255 				use_lns = 1;
256 				break;
257 			}
258 		}
259 
260 		for (unsigned int i = 0; i < bsd.texel_count; i++)
261 		{
262 			blk.data_r[i] = color.lane<0>();
263 			blk.data_g[i] = color.lane<1>();
264 			blk.data_b[i] = color.lane<2>();
265 			blk.data_a[i] = color.lane<3>();
266 			blk.rgb_lns[i] = use_lns;
267 			blk.alpha_lns[i] = use_lns;
268 		}
269 
270 		return;
271 	}
272 
273 	// Get the appropriate partition-table entry
274 	int partition_count = scb.partition_count;
275 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
276 
277 	// Get the appropriate block descriptors
278 	const auto& bm = bsd.get_block_mode(scb.block_mode);
279 	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
280 
281 	bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
282 
283 	// Unquantize and undecimate the weights
284 	int plane1_weights[BLOCK_MAX_TEXELS];
285 	int plane2_weights[BLOCK_MAX_TEXELS];
286 	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
287 
288 	// Now that we have endpoint colors and weights, we can unpack texel colors
289 	int plane2_component = is_dual_plane ? scb.plane2_component : -1;
290 	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
291 
292 	for (int i = 0; i < partition_count; i++)
293 	{
294 		// Decode the color endpoints for this partition
295 		vint4 ep0;
296 		vint4 ep1;
297 		bool rgb_lns;
298 		bool a_lns;
299 
300 		unpack_color_endpoints(decode_mode,
301 		                       scb.color_formats[i],
302 		                       scb.get_color_quant_mode(),
303 		                       scb.color_values[i],
304 		                       rgb_lns, a_lns,
305 		                       ep0, ep1);
306 
307 		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
308 
309 		int texel_count = pi.partition_texel_count[i];
310 		for (int j = 0; j < texel_count; j++)
311 		{
312 			int tix = pi.texels_of_partition[i][j];
313 			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
314 			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
315 			vfloat4 colorf = decode_texel(color, lns_mask);
316 
317 			blk.data_r[tix] = colorf.lane<0>();
318 			blk.data_g[tix] = colorf.lane<1>();
319 			blk.data_b[tix] = colorf.lane<2>();
320 			blk.data_a[tix] = colorf.lane<3>();
321 		}
322 	}
323 }
324 
325 #if !defined(ASTCENC_DECOMPRESS_ONLY)
326 
327 /* See header for documentation. */
compute_symbolic_block_difference_2plane(const astcenc_config & config,const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const image_block & blk)328 float compute_symbolic_block_difference_2plane(
329 	const astcenc_config& config,
330 	const block_size_descriptor& bsd,
331 	const symbolic_compressed_block& scb,
332 	const image_block& blk
333 ) {
334 	// If we detected an error-block, blow up immediately.
335 	if (scb.block_type == SYM_BTYPE_ERROR)
336 	{
337 		return ERROR_CALC_DEFAULT;
338 	}
339 
340 	assert(scb.block_mode >= 0);
341 	assert(scb.partition_count == 1);
342 	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
343 
344 	// Get the appropriate block descriptor
345 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
346 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
347 
348 	// Unquantize and undecimate the weights
349 	int plane1_weights[BLOCK_MAX_TEXELS];
350 	int plane2_weights[BLOCK_MAX_TEXELS];
351 	unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
352 
353 	vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
354 
355 	vfloat4 summa = vfloat4::zero();
356 
357 	// Decode the color endpoints for this partition
358 	vint4 ep0;
359 	vint4 ep1;
360 	bool rgb_lns;
361 	bool a_lns;
362 
363 	unpack_color_endpoints(config.profile,
364 	                       scb.color_formats[0],
365 	                       scb.get_color_quant_mode(),
366 	                       scb.color_values[0],
367 	                       rgb_lns, a_lns,
368 	                       ep0, ep1);
369 
370 	// Unpack and compute error for each texel in the partition
371 	unsigned int texel_count = bsd.texel_count;
372 	for (unsigned int i = 0; i < texel_count; i++)
373 	{
374 		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
375 		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
376 
377 		vfloat4 color = int_to_float(colori);
378 		vfloat4 oldColor = blk.texel(i);
379 
380 		// Compare error using a perceptual decode metric for RGBM textures
381 		if (config.flags & ASTCENC_FLG_MAP_RGBM)
382 		{
383 			// Fail encodings that result in zero weight M pixels. Note that this can cause
384 			// "interesting" artifacts if we reject all useful encodings - we typically get max
385 			// brightness encodings instead which look just as bad. We recommend users apply a
386 			// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
387 			// getting small M values post-quantization, but we can't prove it would never
388 			// happen, especially at low bit rates ...
389 			if (color.lane<3>() == 0.0f)
390 			{
391 				return -ERROR_CALC_DEFAULT;
392 			}
393 
394 			// Compute error based on decoded RGBM color
395 			color = vfloat4(
396 				color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
397 				color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
398 				color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
399 				1.0f
400 			);
401 
402 			oldColor = vfloat4(
403 				oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
404 				oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
405 				oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
406 				1.0f
407 			);
408 		}
409 
410 		vfloat4 error = oldColor - color;
411 		error = min(abs(error), 1e15f);
412 		error = error * error;
413 
414 		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
415 	}
416 
417 	return summa.lane<0>();
418 }
419 
420 /* See header for documentation. */
compute_symbolic_block_difference_1plane(const astcenc_config & config,const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const image_block & blk)421 float compute_symbolic_block_difference_1plane(
422 	const astcenc_config& config,
423 	const block_size_descriptor& bsd,
424 	const symbolic_compressed_block& scb,
425 	const image_block& blk
426 ) {
427 	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
428 
429 	// If we detected an error-block, blow up immediately.
430 	if (scb.block_type == SYM_BTYPE_ERROR)
431 	{
432 		return ERROR_CALC_DEFAULT;
433 	}
434 
435 	assert(scb.block_mode >= 0);
436 
437 	// Get the appropriate partition-table entry
438 	unsigned int partition_count = scb.partition_count;
439 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
440 
441 	// Get the appropriate block descriptor
442 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
443 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
444 
445 	// Unquantize and undecimate the weights
446 	int plane1_weights[BLOCK_MAX_TEXELS];
447 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
448 
449 	vfloat4 summa = vfloat4::zero();
450 	for (unsigned int i = 0; i < partition_count; i++)
451 	{
452 		// Decode the color endpoints for this partition
453 		vint4 ep0;
454 		vint4 ep1;
455 		bool rgb_lns;
456 		bool a_lns;
457 
458 		unpack_color_endpoints(config.profile,
459 		                       scb.color_formats[i],
460 		                       scb.get_color_quant_mode(),
461 		                       scb.color_values[i],
462 		                       rgb_lns, a_lns,
463 		                       ep0, ep1);
464 
465 		// Unpack and compute error for each texel in the partition
466 		unsigned int texel_count = pi.partition_texel_count[i];
467 		for (unsigned int j = 0; j < texel_count; j++)
468 		{
469 			unsigned int tix = pi.texels_of_partition[i][j];
470 			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
471 			                              vint4(plane1_weights[tix]));
472 
473 			vfloat4 color = int_to_float(colori);
474 			vfloat4 oldColor = blk.texel(tix);
475 
476 			// Compare error using a perceptual decode metric for RGBM textures
477 			if (config.flags & ASTCENC_FLG_MAP_RGBM)
478 			{
479 				// Fail encodings that result in zero weight M pixels. Note that this can cause
480 				// "interesting" artifacts if we reject all useful encodings - we typically get max
481 				// brightness encodings instead which look just as bad. We recommend users apply a
482 				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
483 				// getting small M values post-quantization, but we can't prove it would never
484 				// happen, especially at low bit rates ...
485 				if (color.lane<3>() == 0.0f)
486 				{
487 					return -ERROR_CALC_DEFAULT;
488 				}
489 
490 				// Compute error based on decoded RGBM color
491 				color = vfloat4(
492 					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
493 					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
494 					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
495 					1.0f
496 				);
497 
498 				oldColor = vfloat4(
499 					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
500 					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
501 					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
502 					1.0f
503 				);
504 			}
505 
506 			vfloat4 error = oldColor - color;
507 			error = min(abs(error), 1e15f);
508 			error = error * error;
509 
510 			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
511 		}
512 	}
513 
514 	return summa.lane<0>();
515 }
516 
517 /* See header for documentation. */
compute_symbolic_block_difference_1plane_1partition(const astcenc_config & config,const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const image_block & blk)518 float compute_symbolic_block_difference_1plane_1partition(
519 	const astcenc_config& config,
520 	const block_size_descriptor& bsd,
521 	const symbolic_compressed_block& scb,
522 	const image_block& blk
523 ) {
524 	// If we detected an error-block, blow up immediately.
525 	if (scb.block_type == SYM_BTYPE_ERROR)
526 	{
527 		return ERROR_CALC_DEFAULT;
528 	}
529 
530 	assert(scb.block_mode >= 0);
531 	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
532 
533 	// Get the appropriate block descriptor
534 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
535 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
536 
537 	// Unquantize and undecimate the weights
538 	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
539 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
540 
541 	// Decode the color endpoints for this partition
542 	vint4 ep0;
543 	vint4 ep1;
544 	bool rgb_lns;
545 	bool a_lns;
546 
547 	unpack_color_endpoints(config.profile,
548 	                       scb.color_formats[0],
549 	                       scb.get_color_quant_mode(),
550 	                       scb.color_values[0],
551 	                       rgb_lns, a_lns,
552 	                       ep0, ep1);
553 
554 
555 	// Pre-shift sRGB so things round correctly
556 	if (config.profile == ASTCENC_PRF_LDR_SRGB)
557 	{
558 		ep0 = asr<8>(ep0);
559 		ep1 = asr<8>(ep1);
560 	}
561 
562 	// Unpack and compute error for each texel in the partition
563 	vfloatacc summav = vfloatacc::zero();
564 
565 	vint lane_id = vint::lane_id();
566 	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
567 
568 	unsigned int texel_count = bsd.texel_count;
569 	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
570 	{
571 		// Compute EP1 contribution
572 		vint weight1 = vint::loada(plane1_weights + i);
573 		vint ep1_r = vint(ep1.lane<0>()) * weight1;
574 		vint ep1_g = vint(ep1.lane<1>()) * weight1;
575 		vint ep1_b = vint(ep1.lane<2>()) * weight1;
576 		vint ep1_a = vint(ep1.lane<3>()) * weight1;
577 
578 		// Compute EP0 contribution
579 		vint weight0 = vint(64) - weight1;
580 		vint ep0_r = vint(ep0.lane<0>()) * weight0;
581 		vint ep0_g = vint(ep0.lane<1>()) * weight0;
582 		vint ep0_b = vint(ep0.lane<2>()) * weight0;
583 		vint ep0_a = vint(ep0.lane<3>()) * weight0;
584 
585 		// Shift so things round correctly
586 		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
587 		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
588 		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
589 		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
590 
591 		// Compute color diff
592 		vfloat color_r = int_to_float(colori_r);
593 		vfloat color_g = int_to_float(colori_g);
594 		vfloat color_b = int_to_float(colori_b);
595 		vfloat color_a = int_to_float(colori_a);
596 
597 		vfloat color_orig_r = loada(blk.data_r + i);
598 		vfloat color_orig_g = loada(blk.data_g + i);
599 		vfloat color_orig_b = loada(blk.data_b + i);
600 		vfloat color_orig_a = loada(blk.data_a + i);
601 
602 		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
603 		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
604 		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
605 		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
606 
607 		// Compute squared error metric
608 		color_error_r = color_error_r * color_error_r;
609 		color_error_g = color_error_g * color_error_g;
610 		color_error_b = color_error_b * color_error_b;
611 		color_error_a = color_error_a * color_error_a;
612 
613 		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
614 		              + color_error_g * blk.channel_weight.lane<1>()
615 		              + color_error_b * blk.channel_weight.lane<2>()
616 		              + color_error_a * blk.channel_weight.lane<3>();
617 
618 		// Mask off bad lanes
619 		vmask mask = lane_id < vint(texel_count);
620 		lane_id += vint(ASTCENC_SIMD_WIDTH);
621 		haccumulate(summav, metric, mask);
622 	}
623 
624 	return hadd_s(summav);
625 }
626 
627 #endif
628