1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions for creating in-memory ASTC image structures.
20  */
21 
22 #include <cassert>
23 #include <cstring>
24 
25 #include "astcenc_internal.h"
26 
27 /**
28  * @brief Loader pipeline function type for data fetch from memory.
29  */
30 using pixel_loader = vfloat4(*)(const void*, int);
31 
32 /**
33  * @brief Loader pipeline function type for swizzling data in a vector.
34  */
35 using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
36 
37 /**
38  * @brief Loader pipeline function type for converting data in a vector to LNS.
39  */
40 using pixel_converter = vfloat4(*)(vfloat4, vmask4);
41 
42 /**
43  * @brief Load a 8-bit UNORM texel from a data array.
44  *
45  * @param data          The data pointer.
46  * @param base_offset   The index offset to the start of the pixel.
47  */
load_texel_u8(const void * data,int base_offset)48 static vfloat4 load_texel_u8(
49 	const void* data,
50 	int base_offset
51 ) {
52 	const uint8_t* data8 = static_cast<const uint8_t*>(data);
53 	return int_to_float(vint4(data8 + base_offset)) / 255.0f;
54 }
55 
56 /**
57  * @brief Load a 16-bit fp16 texel from a data array.
58  *
59  * @param data          The data pointer.
60  * @param base_offset   The index offset to the start of the pixel.
61  */
load_texel_f16(const void * data,int base_offset)62 static vfloat4 load_texel_f16(
63 	const void* data,
64 	int base_offset
65 ) {
66 	const uint16_t* data16 = static_cast<const uint16_t*>(data);
67 	int r = data16[base_offset    ];
68 	int g = data16[base_offset + 1];
69 	int b = data16[base_offset + 2];
70 	int a = data16[base_offset + 3];
71 	return float16_to_float(vint4(r, g, b, a));
72 }
73 
74 /**
75  * @brief Load a 32-bit float texel from a data array.
76  *
77  * @param data          The data pointer.
78  * @param base_offset   The index offset to the start of the pixel.
79  */
load_texel_f32(const void * data,int base_offset)80 static vfloat4 load_texel_f32(
81 	const void* data,
82 	int base_offset
83 ) {
84 	const float* data32 = static_cast<const float*>(data);
85 	return vfloat4(data32 + base_offset);
86 }
87 
88 /**
89  * @brief Dummy no-op swizzle function.
90  *
91  * @param data   The source RGBA vector to swizzle.
92  * @param swz    The swizzle to use.
93  */
swz_texel_skip(vfloat4 data,const astcenc_swizzle & swz)94 static vfloat4 swz_texel_skip(
95 	vfloat4 data,
96 	const astcenc_swizzle& swz
97 ) {
98 	(void)swz;
99 	return data;
100 }
101 
102 /**
103  * @brief Swizzle a texel into a new arrangement.
104  *
105  * @param data   The source RGBA vector to swizzle.
106  * @param swz    The swizzle to use.
107  */
swz_texel(vfloat4 data,const astcenc_swizzle & swz)108 static vfloat4 swz_texel(
109 	vfloat4 data,
110 	const astcenc_swizzle& swz
111 ) {
112 	alignas(16) float datas[6];
113 
114 	storea(data, datas);
115 	datas[ASTCENC_SWZ_0] = 0.0f;
116 	datas[ASTCENC_SWZ_1] = 1.0f;
117 
118 	return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
119 }
120 
121 /**
122  * @brief Encode a texel that is entirely LDR linear.
123  *
124  * @param data       The RGBA data to encode.
125  * @param lns_mask   The mask for the HDR channels than need LNS encoding.
126  */
encode_texel_unorm(vfloat4 data,vmask4 lns_mask)127 static vfloat4 encode_texel_unorm(
128 	vfloat4 data,
129 	vmask4 lns_mask
130 ) {
131 	(void)lns_mask;
132 	return data * 65535.0f;
133 }
134 
135 /**
136  * @brief Encode a texel that includes at least some HDR LNS texels.
137  *
138  * @param data       The RGBA data to encode.
139  * @param lns_mask   The mask for the HDR channels than need LNS encoding.
140  */
encode_texel_lns(vfloat4 data,vmask4 lns_mask)141 static vfloat4 encode_texel_lns(
142 	vfloat4 data,
143 	vmask4 lns_mask
144 ) {
145 	vfloat4 datav_unorm = data * 65535.0f;
146 	vfloat4 datav_lns = float_to_lns(data);
147 	return select(datav_unorm, datav_lns, lns_mask);
148 }
149 
150 /* See header for documentation. */
load_image_block(astcenc_profile decode_mode,const astcenc_image & img,image_block & blk,const block_size_descriptor & bsd,unsigned int xpos,unsigned int ypos,unsigned int zpos,const astcenc_swizzle & swz)151 void load_image_block(
152 	astcenc_profile decode_mode,
153 	const astcenc_image& img,
154 	image_block& blk,
155 	const block_size_descriptor& bsd,
156 	unsigned int xpos,
157 	unsigned int ypos,
158 	unsigned int zpos,
159 	const astcenc_swizzle& swz
160 ) {
161 	unsigned int xsize = img.dim_x;
162 	unsigned int ysize = img.dim_y;
163 	unsigned int zsize = img.dim_z;
164 
165 	blk.xpos = xpos;
166 	blk.ypos = ypos;
167 	blk.zpos = zpos;
168 
169 	// True if any non-identity swizzle
170 	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
171 	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
172 
173 	int idx = 0;
174 
175 	vfloat4 data_min(1e38f);
176 	vfloat4 data_mean(0.0f);
177 	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
178 	vfloat4 data_max(-1e38f);
179 	vmask4 grayscalev(true);
180 
181 	// This works because we impose the same choice everywhere during encode
182 	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
183 	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
184 	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
185 	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
186 	vmask4 lns_mask = use_lns != vint4::zero();
187 
188 	// Set up the function pointers for loading pipeline as needed
189 	pixel_loader loader = load_texel_u8;
190 	if (img.data_type == ASTCENC_TYPE_F16)
191 	{
192 		loader = load_texel_f16;
193 	}
194 	else if  (img.data_type == ASTCENC_TYPE_F32)
195 	{
196 		loader = load_texel_f32;
197 	}
198 
199 	pixel_swizzler swizzler = swz_texel_skip;
200 	if (needs_swz)
201 	{
202 		swizzler = swz_texel;
203 	}
204 
205 	pixel_converter converter = encode_texel_unorm;
206 	if (any(lns_mask))
207 	{
208 		converter = encode_texel_lns;
209 	}
210 
211 	for (unsigned int z = 0; z < bsd.zdim; z++)
212 	{
213 		unsigned int zi = astc::min(zpos + z, zsize - 1);
214 		void* plane = img.data[zi];
215 
216 		for (unsigned int y = 0; y < bsd.ydim; y++)
217 		{
218 			unsigned int yi = astc::min(ypos + y, ysize - 1);
219 
220 			for (unsigned int x = 0; x < bsd.xdim; x++)
221 			{
222 				unsigned int xi = astc::min(xpos + x, xsize - 1);
223 
224 				vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
225 				datav = swizzler(datav, swz);
226 				datav = converter(datav, lns_mask);
227 
228 				// Compute block metadata
229 				data_min = min(data_min, datav);
230 				data_mean += datav * data_mean_scale;
231 				data_max = max(data_max, datav);
232 
233 				grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
234 
235 				blk.data_r[idx] = datav.lane<0>();
236 				blk.data_g[idx] = datav.lane<1>();
237 				blk.data_b[idx] = datav.lane<2>();
238 				blk.data_a[idx] = datav.lane<3>();
239 
240 				blk.rgb_lns[idx] = rgb_lns;
241 				blk.alpha_lns[idx] = a_lns;
242 
243 				idx++;
244 			}
245 		}
246 	}
247 
248 	// Reverse the encoding so we store origin block in the original format
249 	vfloat4 data_enc = blk.texel(0);
250 	vfloat4 data_enc_unorm = data_enc / 65535.0f;
251 	vfloat4 data_enc_lns = vfloat4::zero();
252 
253 	if (rgb_lns || a_lns)
254 	{
255 		data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
256 	}
257 
258 	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
259 
260 	// Store block metadata
261 	blk.data_min = data_min;
262 	blk.data_mean = data_mean;
263 	blk.data_max = data_max;
264 	blk.grayscale = all(grayscalev);
265 }
266 
267 /* See header for documentation. */
load_image_block_fast_ldr(astcenc_profile decode_mode,const astcenc_image & img,image_block & blk,const block_size_descriptor & bsd,unsigned int xpos,unsigned int ypos,unsigned int zpos,const astcenc_swizzle & swz)268 void load_image_block_fast_ldr(
269 	astcenc_profile decode_mode,
270 	const astcenc_image& img,
271 	image_block& blk,
272 	const block_size_descriptor& bsd,
273 	unsigned int xpos,
274 	unsigned int ypos,
275 	unsigned int zpos,
276 	const astcenc_swizzle& swz
277 ) {
278 	(void)swz;
279 	(void)decode_mode;
280 
281 	unsigned int xsize = img.dim_x;
282 	unsigned int ysize = img.dim_y;
283 
284 	blk.xpos = xpos;
285 	blk.ypos = ypos;
286 	blk.zpos = zpos;
287 
288 	vfloat4 data_min(1e38f);
289 	vfloat4 data_mean = vfloat4::zero();
290 	vfloat4 data_max(-1e38f);
291 	vmask4 grayscalev(true);
292 	int idx = 0;
293 
294 	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
295 	for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
296 	{
297 		unsigned int yi = astc::min(y, ysize - 1);
298 
299 		for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
300 		{
301 			unsigned int xi = astc::min(x, xsize - 1);
302 
303 			vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
304 			vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
305 
306 			// Compute block metadata
307 			data_min = min(data_min, datav);
308 			data_mean += datav;
309 			data_max = max(data_max, datav);
310 
311 			grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
312 
313 			blk.data_r[idx] = datav.lane<0>();
314 			blk.data_g[idx] = datav.lane<1>();
315 			blk.data_b[idx] = datav.lane<2>();
316 			blk.data_a[idx] = datav.lane<3>();
317 
318 			idx++;
319 		}
320 	}
321 
322 	// Reverse the encoding so we store origin block in the original format
323 	blk.origin_texel = blk.texel(0) / 65535.0f;
324 
325 	// Store block metadata
326 	blk.rgb_lns[0] = 0;
327 	blk.alpha_lns[0] = 0;
328 	blk.data_min = data_min;
329 	blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
330 	blk.data_max = data_max;
331 	blk.grayscale = all(grayscalev);
332 }
333 
334 /* See header for documentation. */
store_image_block(astcenc_image & img,const image_block & blk,const block_size_descriptor & bsd,unsigned int xpos,unsigned int ypos,unsigned int zpos,const astcenc_swizzle & swz)335 void store_image_block(
336 	astcenc_image& img,
337 	const image_block& blk,
338 	const block_size_descriptor& bsd,
339 	unsigned int xpos,
340 	unsigned int ypos,
341 	unsigned int zpos,
342 	const astcenc_swizzle& swz
343 ) {
344 	unsigned int x_size = img.dim_x;
345 	unsigned int x_start = xpos;
346 	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
347 	unsigned int x_count = x_end - x_start;
348 	unsigned int x_nudge = bsd.xdim - x_count;
349 
350 	unsigned int y_size = img.dim_y;
351 	unsigned int y_start = ypos;
352 	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
353 	unsigned int y_count = y_end - y_start;
354 	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
355 
356 	unsigned int z_size = img.dim_z;
357 	unsigned int z_start = zpos;
358 	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
359 
360 	// True if any non-identity swizzle
361 	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
362 	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
363 
364 	// True if any swizzle uses Z reconstruct
365 	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
366 	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
367 
368 	int idx = 0;
369 	if (img.data_type == ASTCENC_TYPE_U8)
370 	{
371 		for (unsigned int z = z_start; z < z_end; z++)
372 		{
373 			// Fetch the image plane
374 			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
375 
376 			for (unsigned int y = y_start; y < y_end; y++)
377 			{
378 				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
379 
380 				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
381 				{
382 					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
383 					unsigned int used_texels = astc::min(x_count - x, max_texels);
384 
385 					// Unaligned load as rows are not always SIMD_WIDTH long
386 					vfloat data_r(blk.data_r + idx);
387 					vfloat data_g(blk.data_g + idx);
388 					vfloat data_b(blk.data_b + idx);
389 					vfloat data_a(blk.data_a + idx);
390 
391 					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
392 					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
393 					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
394 					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
395 
396 					if (needs_swz)
397 					{
398 						vint swizzle_table[7];
399 						swizzle_table[ASTCENC_SWZ_0] = vint(0);
400 						swizzle_table[ASTCENC_SWZ_1] = vint(255);
401 						swizzle_table[ASTCENC_SWZ_R] = data_ri;
402 						swizzle_table[ASTCENC_SWZ_G] = data_gi;
403 						swizzle_table[ASTCENC_SWZ_B] = data_bi;
404 						swizzle_table[ASTCENC_SWZ_A] = data_ai;
405 
406 						if (needs_z)
407 						{
408 							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
409 							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
410 							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
411 							data_z = max(data_z, 0.0f);
412 							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
413 
414 							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
415 						}
416 
417 						data_ri = swizzle_table[swz.r];
418 						data_gi = swizzle_table[swz.g];
419 						data_bi = swizzle_table[swz.b];
420 						data_ai = swizzle_table[swz.a];
421 					}
422 
423 					// Errors are NaN encoded - convert to magenta error color
424 					// Branch is OK here - it is almost never true so predicts well
425 					vmask nan_mask = data_r != data_r;
426 					if (any(nan_mask))
427 					{
428 						data_ri = select(data_ri, vint(0xFF), nan_mask);
429 						data_gi = select(data_gi, vint(0x00), nan_mask);
430 						data_bi = select(data_bi, vint(0xFF), nan_mask);
431 						data_ai = select(data_ai, vint(0xFF), nan_mask);
432 					}
433 
434 					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
435 					vmask store_mask = vint::lane_id() < vint(used_texels);
436 					store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
437 
438 					data8_row += ASTCENC_SIMD_WIDTH * 4;
439 					idx += used_texels;
440 				}
441 				idx += x_nudge;
442 			}
443 			idx += y_nudge;
444 		}
445 	}
446 	else if (img.data_type == ASTCENC_TYPE_F16)
447 	{
448 		for (unsigned int z = z_start; z < z_end; z++)
449 		{
450 			// Fetch the image plane
451 			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
452 
453 			for (unsigned int y = y_start; y < y_end; y++)
454 			{
455 				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
456 
457 				for (unsigned int x = 0; x < x_count; x++)
458 				{
459 					vint4 color;
460 
461 					// NaNs are handled inline - no need to special case
462 					if (needs_swz)
463 					{
464 						float data[7];
465 						data[ASTCENC_SWZ_0] = 0.0f;
466 						data[ASTCENC_SWZ_1] = 1.0f;
467 						data[ASTCENC_SWZ_R] = blk.data_r[idx];
468 						data[ASTCENC_SWZ_G] = blk.data_g[idx];
469 						data[ASTCENC_SWZ_B] = blk.data_b[idx];
470 						data[ASTCENC_SWZ_A] = blk.data_a[idx];
471 
472 						if (needs_z)
473 						{
474 							float xN = (data[0] * 2.0f) - 1.0f;
475 							float yN = (data[3] * 2.0f) - 1.0f;
476 							float zN = 1.0f - xN * xN - yN * yN;
477 							if (zN < 0.0f)
478 							{
479 								zN = 0.0f;
480 							}
481 							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
482 						}
483 
484 						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
485 						color = float_to_float16(colorf);
486 					}
487 					else
488 					{
489 						vfloat4 colorf = blk.texel(idx);
490 						color = float_to_float16(colorf);
491 					}
492 
493 					// TODO: Vectorize with store N shorts?
494 					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
495 					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
496 					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
497 					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
498 					data16_row += 4;
499 					idx++;
500 				}
501 				idx += x_nudge;
502 			}
503 			idx += y_nudge;
504 		}
505 	}
506 	else // if (img.data_type == ASTCENC_TYPE_F32)
507 	{
508 		assert(img.data_type == ASTCENC_TYPE_F32);
509 
510 		for (unsigned int z = z_start; z < z_end; z++)
511 		{
512 			// Fetch the image plane
513 			float* data32 = static_cast<float*>(img.data[z]);
514 
515 			for (unsigned int y = y_start; y < y_end; y++)
516 			{
517 				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
518 
519 				for (unsigned int x = 0; x < x_count; x++)
520 				{
521 					vfloat4 color = blk.texel(idx);
522 
523 					// NaNs are handled inline - no need to special case
524 					if (needs_swz)
525 					{
526 						float data[7];
527 						data[ASTCENC_SWZ_0] = 0.0f;
528 						data[ASTCENC_SWZ_1] = 1.0f;
529 						data[ASTCENC_SWZ_R] = color.lane<0>();
530 						data[ASTCENC_SWZ_G] = color.lane<1>();
531 						data[ASTCENC_SWZ_B] = color.lane<2>();
532 						data[ASTCENC_SWZ_A] = color.lane<3>();
533 
534 						if (needs_z)
535 						{
536 							float xN = (data[0] * 2.0f) - 1.0f;
537 							float yN = (data[3] * 2.0f) - 1.0f;
538 							float zN = 1.0f - xN * xN - yN * yN;
539 							if (zN < 0.0f)
540 							{
541 								zN = 0.0f;
542 							}
543 							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
544 						}
545 
546 						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
547 					}
548 
549 					store(color, data32_row);
550 					data32_row += 4;
551 					idx++;
552 				}
553 				idx += x_nudge;
554 			}
555 			idx += y_nudge;
556 		}
557 	}
558 }
559