1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Functions for creating in-memory ASTC image structures.
20 */
21
22 #include <cassert>
23 #include <cstring>
24
25 #include "astcenc_internal.h"
26
27 /**
28 * @brief Loader pipeline function type for data fetch from memory.
29 */
30 using pixel_loader = vfloat4(*)(const void*, int);
31
32 /**
33 * @brief Loader pipeline function type for swizzling data in a vector.
34 */
35 using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
36
37 /**
38 * @brief Loader pipeline function type for converting data in a vector to LNS.
39 */
40 using pixel_converter = vfloat4(*)(vfloat4, vmask4);
41
42 /**
43 * @brief Load a 8-bit UNORM texel from a data array.
44 *
45 * @param data The data pointer.
46 * @param base_offset The index offset to the start of the pixel.
47 */
load_texel_u8(const void * data,int base_offset)48 static vfloat4 load_texel_u8(
49 const void* data,
50 int base_offset
51 ) {
52 const uint8_t* data8 = static_cast<const uint8_t*>(data);
53 return int_to_float(vint4(data8 + base_offset)) / 255.0f;
54 }
55
56 /**
57 * @brief Load a 16-bit fp16 texel from a data array.
58 *
59 * @param data The data pointer.
60 * @param base_offset The index offset to the start of the pixel.
61 */
load_texel_f16(const void * data,int base_offset)62 static vfloat4 load_texel_f16(
63 const void* data,
64 int base_offset
65 ) {
66 const uint16_t* data16 = static_cast<const uint16_t*>(data);
67 int r = data16[base_offset ];
68 int g = data16[base_offset + 1];
69 int b = data16[base_offset + 2];
70 int a = data16[base_offset + 3];
71 return float16_to_float(vint4(r, g, b, a));
72 }
73
74 /**
75 * @brief Load a 32-bit float texel from a data array.
76 *
77 * @param data The data pointer.
78 * @param base_offset The index offset to the start of the pixel.
79 */
load_texel_f32(const void * data,int base_offset)80 static vfloat4 load_texel_f32(
81 const void* data,
82 int base_offset
83 ) {
84 const float* data32 = static_cast<const float*>(data);
85 return vfloat4(data32 + base_offset);
86 }
87
88 /**
89 * @brief Dummy no-op swizzle function.
90 *
91 * @param data The source RGBA vector to swizzle.
92 * @param swz The swizzle to use.
93 */
swz_texel_skip(vfloat4 data,const astcenc_swizzle & swz)94 static vfloat4 swz_texel_skip(
95 vfloat4 data,
96 const astcenc_swizzle& swz
97 ) {
98 (void)swz;
99 return data;
100 }
101
102 /**
103 * @brief Swizzle a texel into a new arrangement.
104 *
105 * @param data The source RGBA vector to swizzle.
106 * @param swz The swizzle to use.
107 */
swz_texel(vfloat4 data,const astcenc_swizzle & swz)108 static vfloat4 swz_texel(
109 vfloat4 data,
110 const astcenc_swizzle& swz
111 ) {
112 alignas(16) float datas[6];
113
114 storea(data, datas);
115 datas[ASTCENC_SWZ_0] = 0.0f;
116 datas[ASTCENC_SWZ_1] = 1.0f;
117
118 return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
119 }
120
121 /**
122 * @brief Encode a texel that is entirely LDR linear.
123 *
124 * @param data The RGBA data to encode.
125 * @param lns_mask The mask for the HDR channels than need LNS encoding.
126 */
encode_texel_unorm(vfloat4 data,vmask4 lns_mask)127 static vfloat4 encode_texel_unorm(
128 vfloat4 data,
129 vmask4 lns_mask
130 ) {
131 (void)lns_mask;
132 return data * 65535.0f;
133 }
134
135 /**
136 * @brief Encode a texel that includes at least some HDR LNS texels.
137 *
138 * @param data The RGBA data to encode.
139 * @param lns_mask The mask for the HDR channels than need LNS encoding.
140 */
encode_texel_lns(vfloat4 data,vmask4 lns_mask)141 static vfloat4 encode_texel_lns(
142 vfloat4 data,
143 vmask4 lns_mask
144 ) {
145 vfloat4 datav_unorm = data * 65535.0f;
146 vfloat4 datav_lns = float_to_lns(data);
147 return select(datav_unorm, datav_lns, lns_mask);
148 }
149
150 /* See header for documentation. */
load_image_block(astcenc_profile decode_mode,const astcenc_image & img,image_block & blk,const block_size_descriptor & bsd,unsigned int xpos,unsigned int ypos,unsigned int zpos,const astcenc_swizzle & swz)151 void load_image_block(
152 astcenc_profile decode_mode,
153 const astcenc_image& img,
154 image_block& blk,
155 const block_size_descriptor& bsd,
156 unsigned int xpos,
157 unsigned int ypos,
158 unsigned int zpos,
159 const astcenc_swizzle& swz
160 ) {
161 unsigned int xsize = img.dim_x;
162 unsigned int ysize = img.dim_y;
163 unsigned int zsize = img.dim_z;
164
165 blk.xpos = xpos;
166 blk.ypos = ypos;
167 blk.zpos = zpos;
168
169 // True if any non-identity swizzle
170 bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
171 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
172
173 int idx = 0;
174
175 vfloat4 data_min(1e38f);
176 vfloat4 data_mean(0.0f);
177 vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
178 vfloat4 data_max(-1e38f);
179 vmask4 grayscalev(true);
180
181 // This works because we impose the same choice everywhere during encode
182 uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
183 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
184 uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
185 vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
186 vmask4 lns_mask = use_lns != vint4::zero();
187
188 // Set up the function pointers for loading pipeline as needed
189 pixel_loader loader = load_texel_u8;
190 if (img.data_type == ASTCENC_TYPE_F16)
191 {
192 loader = load_texel_f16;
193 }
194 else if (img.data_type == ASTCENC_TYPE_F32)
195 {
196 loader = load_texel_f32;
197 }
198
199 pixel_swizzler swizzler = swz_texel_skip;
200 if (needs_swz)
201 {
202 swizzler = swz_texel;
203 }
204
205 pixel_converter converter = encode_texel_unorm;
206 if (any(lns_mask))
207 {
208 converter = encode_texel_lns;
209 }
210
211 for (unsigned int z = 0; z < bsd.zdim; z++)
212 {
213 unsigned int zi = astc::min(zpos + z, zsize - 1);
214 void* plane = img.data[zi];
215
216 for (unsigned int y = 0; y < bsd.ydim; y++)
217 {
218 unsigned int yi = astc::min(ypos + y, ysize - 1);
219
220 for (unsigned int x = 0; x < bsd.xdim; x++)
221 {
222 unsigned int xi = astc::min(xpos + x, xsize - 1);
223
224 vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
225 datav = swizzler(datav, swz);
226 datav = converter(datav, lns_mask);
227
228 // Compute block metadata
229 data_min = min(data_min, datav);
230 data_mean += datav * data_mean_scale;
231 data_max = max(data_max, datav);
232
233 grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
234
235 blk.data_r[idx] = datav.lane<0>();
236 blk.data_g[idx] = datav.lane<1>();
237 blk.data_b[idx] = datav.lane<2>();
238 blk.data_a[idx] = datav.lane<3>();
239
240 blk.rgb_lns[idx] = rgb_lns;
241 blk.alpha_lns[idx] = a_lns;
242
243 idx++;
244 }
245 }
246 }
247
248 // Reverse the encoding so we store origin block in the original format
249 vfloat4 data_enc = blk.texel(0);
250 vfloat4 data_enc_unorm = data_enc / 65535.0f;
251 vfloat4 data_enc_lns = vfloat4::zero();
252
253 if (rgb_lns || a_lns)
254 {
255 data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
256 }
257
258 blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
259
260 // Store block metadata
261 blk.data_min = data_min;
262 blk.data_mean = data_mean;
263 blk.data_max = data_max;
264 blk.grayscale = all(grayscalev);
265 }
266
267 /* See header for documentation. */
load_image_block_fast_ldr(astcenc_profile decode_mode,const astcenc_image & img,image_block & blk,const block_size_descriptor & bsd,unsigned int xpos,unsigned int ypos,unsigned int zpos,const astcenc_swizzle & swz)268 void load_image_block_fast_ldr(
269 astcenc_profile decode_mode,
270 const astcenc_image& img,
271 image_block& blk,
272 const block_size_descriptor& bsd,
273 unsigned int xpos,
274 unsigned int ypos,
275 unsigned int zpos,
276 const astcenc_swizzle& swz
277 ) {
278 (void)swz;
279 (void)decode_mode;
280
281 unsigned int xsize = img.dim_x;
282 unsigned int ysize = img.dim_y;
283
284 blk.xpos = xpos;
285 blk.ypos = ypos;
286 blk.zpos = zpos;
287
288 vfloat4 data_min(1e38f);
289 vfloat4 data_mean = vfloat4::zero();
290 vfloat4 data_max(-1e38f);
291 vmask4 grayscalev(true);
292 int idx = 0;
293
294 const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
295 for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
296 {
297 unsigned int yi = astc::min(y, ysize - 1);
298
299 for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
300 {
301 unsigned int xi = astc::min(x, xsize - 1);
302
303 vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
304 vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
305
306 // Compute block metadata
307 data_min = min(data_min, datav);
308 data_mean += datav;
309 data_max = max(data_max, datav);
310
311 grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
312
313 blk.data_r[idx] = datav.lane<0>();
314 blk.data_g[idx] = datav.lane<1>();
315 blk.data_b[idx] = datav.lane<2>();
316 blk.data_a[idx] = datav.lane<3>();
317
318 idx++;
319 }
320 }
321
322 // Reverse the encoding so we store origin block in the original format
323 blk.origin_texel = blk.texel(0) / 65535.0f;
324
325 // Store block metadata
326 blk.rgb_lns[0] = 0;
327 blk.alpha_lns[0] = 0;
328 blk.data_min = data_min;
329 blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
330 blk.data_max = data_max;
331 blk.grayscale = all(grayscalev);
332 }
333
334 /* See header for documentation. */
store_image_block(astcenc_image & img,const image_block & blk,const block_size_descriptor & bsd,unsigned int xpos,unsigned int ypos,unsigned int zpos,const astcenc_swizzle & swz)335 void store_image_block(
336 astcenc_image& img,
337 const image_block& blk,
338 const block_size_descriptor& bsd,
339 unsigned int xpos,
340 unsigned int ypos,
341 unsigned int zpos,
342 const astcenc_swizzle& swz
343 ) {
344 unsigned int x_size = img.dim_x;
345 unsigned int x_start = xpos;
346 unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
347 unsigned int x_count = x_end - x_start;
348 unsigned int x_nudge = bsd.xdim - x_count;
349
350 unsigned int y_size = img.dim_y;
351 unsigned int y_start = ypos;
352 unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
353 unsigned int y_count = y_end - y_start;
354 unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
355
356 unsigned int z_size = img.dim_z;
357 unsigned int z_start = zpos;
358 unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
359
360 // True if any non-identity swizzle
361 bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
362 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
363
364 // True if any swizzle uses Z reconstruct
365 bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
366 (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
367
368 int idx = 0;
369 if (img.data_type == ASTCENC_TYPE_U8)
370 {
371 for (unsigned int z = z_start; z < z_end; z++)
372 {
373 // Fetch the image plane
374 uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
375
376 for (unsigned int y = y_start; y < y_end; y++)
377 {
378 uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
379
380 for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
381 {
382 unsigned int max_texels = ASTCENC_SIMD_WIDTH;
383 unsigned int used_texels = astc::min(x_count - x, max_texels);
384
385 // Unaligned load as rows are not always SIMD_WIDTH long
386 vfloat data_r(blk.data_r + idx);
387 vfloat data_g(blk.data_g + idx);
388 vfloat data_b(blk.data_b + idx);
389 vfloat data_a(blk.data_a + idx);
390
391 vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
392 vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
393 vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
394 vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
395
396 if (needs_swz)
397 {
398 vint swizzle_table[7];
399 swizzle_table[ASTCENC_SWZ_0] = vint(0);
400 swizzle_table[ASTCENC_SWZ_1] = vint(255);
401 swizzle_table[ASTCENC_SWZ_R] = data_ri;
402 swizzle_table[ASTCENC_SWZ_G] = data_gi;
403 swizzle_table[ASTCENC_SWZ_B] = data_bi;
404 swizzle_table[ASTCENC_SWZ_A] = data_ai;
405
406 if (needs_z)
407 {
408 vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
409 vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
410 vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
411 data_z = max(data_z, 0.0f);
412 data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
413
414 swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
415 }
416
417 data_ri = swizzle_table[swz.r];
418 data_gi = swizzle_table[swz.g];
419 data_bi = swizzle_table[swz.b];
420 data_ai = swizzle_table[swz.a];
421 }
422
423 // Errors are NaN encoded - convert to magenta error color
424 // Branch is OK here - it is almost never true so predicts well
425 vmask nan_mask = data_r != data_r;
426 if (any(nan_mask))
427 {
428 data_ri = select(data_ri, vint(0xFF), nan_mask);
429 data_gi = select(data_gi, vint(0x00), nan_mask);
430 data_bi = select(data_bi, vint(0xFF), nan_mask);
431 data_ai = select(data_ai, vint(0xFF), nan_mask);
432 }
433
434 vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
435 vmask store_mask = vint::lane_id() < vint(used_texels);
436 store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
437
438 data8_row += ASTCENC_SIMD_WIDTH * 4;
439 idx += used_texels;
440 }
441 idx += x_nudge;
442 }
443 idx += y_nudge;
444 }
445 }
446 else if (img.data_type == ASTCENC_TYPE_F16)
447 {
448 for (unsigned int z = z_start; z < z_end; z++)
449 {
450 // Fetch the image plane
451 uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
452
453 for (unsigned int y = y_start; y < y_end; y++)
454 {
455 uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
456
457 for (unsigned int x = 0; x < x_count; x++)
458 {
459 vint4 color;
460
461 // NaNs are handled inline - no need to special case
462 if (needs_swz)
463 {
464 float data[7];
465 data[ASTCENC_SWZ_0] = 0.0f;
466 data[ASTCENC_SWZ_1] = 1.0f;
467 data[ASTCENC_SWZ_R] = blk.data_r[idx];
468 data[ASTCENC_SWZ_G] = blk.data_g[idx];
469 data[ASTCENC_SWZ_B] = blk.data_b[idx];
470 data[ASTCENC_SWZ_A] = blk.data_a[idx];
471
472 if (needs_z)
473 {
474 float xN = (data[0] * 2.0f) - 1.0f;
475 float yN = (data[3] * 2.0f) - 1.0f;
476 float zN = 1.0f - xN * xN - yN * yN;
477 if (zN < 0.0f)
478 {
479 zN = 0.0f;
480 }
481 data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
482 }
483
484 vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
485 color = float_to_float16(colorf);
486 }
487 else
488 {
489 vfloat4 colorf = blk.texel(idx);
490 color = float_to_float16(colorf);
491 }
492
493 // TODO: Vectorize with store N shorts?
494 data16_row[0] = static_cast<uint16_t>(color.lane<0>());
495 data16_row[1] = static_cast<uint16_t>(color.lane<1>());
496 data16_row[2] = static_cast<uint16_t>(color.lane<2>());
497 data16_row[3] = static_cast<uint16_t>(color.lane<3>());
498 data16_row += 4;
499 idx++;
500 }
501 idx += x_nudge;
502 }
503 idx += y_nudge;
504 }
505 }
506 else // if (img.data_type == ASTCENC_TYPE_F32)
507 {
508 assert(img.data_type == ASTCENC_TYPE_F32);
509
510 for (unsigned int z = z_start; z < z_end; z++)
511 {
512 // Fetch the image plane
513 float* data32 = static_cast<float*>(img.data[z]);
514
515 for (unsigned int y = y_start; y < y_end; y++)
516 {
517 float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
518
519 for (unsigned int x = 0; x < x_count; x++)
520 {
521 vfloat4 color = blk.texel(idx);
522
523 // NaNs are handled inline - no need to special case
524 if (needs_swz)
525 {
526 float data[7];
527 data[ASTCENC_SWZ_0] = 0.0f;
528 data[ASTCENC_SWZ_1] = 1.0f;
529 data[ASTCENC_SWZ_R] = color.lane<0>();
530 data[ASTCENC_SWZ_G] = color.lane<1>();
531 data[ASTCENC_SWZ_B] = color.lane<2>();
532 data[ASTCENC_SWZ_A] = color.lane<3>();
533
534 if (needs_z)
535 {
536 float xN = (data[0] * 2.0f) - 1.0f;
537 float yN = (data[3] * 2.0f) - 1.0f;
538 float zN = 1.0f - xN * xN - yN * yN;
539 if (zN < 0.0f)
540 {
541 zN = 0.0f;
542 }
543 data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
544 }
545
546 color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
547 }
548
549 store(color, data32_row);
550 data32_row += 4;
551 idx++;
552 }
553 idx += x_nudge;
554 }
555 idx += y_nudge;
556 }
557 }
558 }
559