1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions to calculate variance per component in a NxN footprint.
22 *
23 * We need N to be parametric, so the routine below uses summed area tables in order to execute in
24 * O(1) time independent of how big N is.
25 *
26 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
27 * perform a binary reduction, and then distributes the results. This method means that there is no
28 * serial dependency between a given element and the next one, and also significantly improves
29 * numerical stability allowing us to use floats rather than doubles.
30 */
31
32 #include "astcenc_internal.h"
33
34 #include <cassert>
35
36 /**
37 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
38 *
39 * This will take an input array of the form:
40 * v0, v1, v2, ...
41 * ... and modify in-place to turn it into a prefix-sum array of the form:
42 * v0, v0+v1, v0+v1+v2, ...
43 *
44 * @param d The array to prefix-sum.
45 * @param items The number of items in the array.
46 * @param stride The item spacing in the array; i.e. dense arrays should use 1.
47 */
brent_kung_prefix_sum(vfloat4 * d,size_t items,int stride)48 static void brent_kung_prefix_sum(
49 vfloat4* d,
50 size_t items,
51 int stride
52 ) {
53 if (items < 2)
54 return;
55
56 size_t lc_stride = 2;
57 size_t log2_stride = 1;
58
59 // The reduction-tree loop
60 do {
61 size_t step = lc_stride >> 1;
62 size_t start = lc_stride - 1;
63 size_t iters = items >> log2_stride;
64
65 vfloat4 *da = d + (start * stride);
66 ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
67 size_t ofs_stride = stride << log2_stride;
68
69 while (iters)
70 {
71 *da = *da + da[ofs];
72 da += ofs_stride;
73 iters--;
74 }
75
76 log2_stride += 1;
77 lc_stride <<= 1;
78 } while (lc_stride <= items);
79
80 // The expansion-tree loop
81 do {
82 log2_stride -= 1;
83 lc_stride >>= 1;
84
85 size_t step = lc_stride >> 1;
86 size_t start = step + lc_stride - 1;
87 size_t iters = (items - step) >> log2_stride;
88
89 vfloat4 *da = d + (start * stride);
90 ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
91 size_t ofs_stride = stride << log2_stride;
92
93 while (iters)
94 {
95 *da = *da + da[ofs];
96 da += ofs_stride;
97 iters--;
98 }
99 } while (lc_stride > 2);
100 }
101
102 /* See header for documentation. */
compute_pixel_region_variance(astcenc_contexti & ctx,const pixel_region_args & arg)103 void compute_pixel_region_variance(
104 astcenc_contexti& ctx,
105 const pixel_region_args& arg
106 ) {
107 // Unpack the memory structure into local variables
108 const astcenc_image* img = arg.img;
109 astcenc_swizzle swz = arg.swz;
110 bool have_z = arg.have_z;
111
112 int size_x = arg.size_x;
113 int size_y = arg.size_y;
114 int size_z = arg.size_z;
115
116 int offset_x = arg.offset_x;
117 int offset_y = arg.offset_y;
118 int offset_z = arg.offset_z;
119
120 int alpha_kernel_radius = arg.alpha_kernel_radius;
121
122 float* input_alpha_averages = ctx.input_alpha_averages;
123 vfloat4* work_memory = arg.work_memory;
124
125 // Compute memory sizes and dimensions that we need
126 int kernel_radius = alpha_kernel_radius;
127 int kerneldim = 2 * kernel_radius + 1;
128 int kernel_radius_xy = kernel_radius;
129 int kernel_radius_z = have_z ? kernel_radius : 0;
130
131 int padsize_x = size_x + kerneldim;
132 int padsize_y = size_y + kerneldim;
133 int padsize_z = size_z + (have_z ? kerneldim : 0);
134 int sizeprod = padsize_x * padsize_y * padsize_z;
135
136 int zd_start = have_z ? 1 : 0;
137
138 vfloat4 *varbuf1 = work_memory;
139 vfloat4 *varbuf2 = work_memory + sizeprod;
140
141 // Scaling factors to apply to Y and Z for accesses into the work buffers
142 int yst = padsize_x;
143 int zst = padsize_x * padsize_y;
144
145 // Scaling factors to apply to Y and Z for accesses into result buffers
146 int ydt = img->dim_x;
147 int zdt = img->dim_x * img->dim_y;
148
149 // Macros to act as accessor functions for the work-memory
150 #define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
151 #define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
152
153 // Load N and N^2 values into the work buffers
154 if (img->data_type == ASTCENC_TYPE_U8)
155 {
156 // Swizzle data structure 4 = ZERO, 5 = ONE
157 uint8_t data[6];
158 data[ASTCENC_SWZ_0] = 0;
159 data[ASTCENC_SWZ_1] = 255;
160
161 for (int z = zd_start; z < padsize_z; z++)
162 {
163 int z_src = (z - zd_start) + offset_z - kernel_radius_z;
164 z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
165 uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
166
167 for (int y = 1; y < padsize_y; y++)
168 {
169 int y_src = (y - 1) + offset_y - kernel_radius_xy;
170 y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
171
172 for (int x = 1; x < padsize_x; x++)
173 {
174 int x_src = (x - 1) + offset_x - kernel_radius_xy;
175 x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
176
177 data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )];
178 data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
179 data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
180 data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
181
182 uint8_t r = data[swz.r];
183 uint8_t g = data[swz.g];
184 uint8_t b = data[swz.b];
185 uint8_t a = data[swz.a];
186
187 vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
188 g * (1.0f / 255.0f),
189 b * (1.0f / 255.0f),
190 a * (1.0f / 255.0f));
191
192 VARBUF1(z, y, x) = d;
193 VARBUF2(z, y, x) = d * d;
194 }
195 }
196 }
197 }
198 else if (img->data_type == ASTCENC_TYPE_F16)
199 {
200 // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
201 uint16_t data[6];
202 data[ASTCENC_SWZ_0] = 0;
203 data[ASTCENC_SWZ_1] = 0x3C00;
204
205 for (int z = zd_start; z < padsize_z; z++)
206 {
207 int z_src = (z - zd_start) + offset_z - kernel_radius_z;
208 z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
209 uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
210
211 for (int y = 1; y < padsize_y; y++)
212 {
213 int y_src = (y - 1) + offset_y - kernel_radius_xy;
214 y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
215
216 for (int x = 1; x < padsize_x; x++)
217 {
218 int x_src = (x - 1) + offset_x - kernel_radius_xy;
219 x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
220
221 data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];
222 data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
223 data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
224 data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
225
226 vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
227 vfloat4 d = float16_to_float(di);
228
229 VARBUF1(z, y, x) = d;
230 VARBUF2(z, y, x) = d * d;
231 }
232 }
233 }
234 }
235 else // if (img->data_type == ASTCENC_TYPE_F32)
236 {
237 assert(img->data_type == ASTCENC_TYPE_F32);
238
239 // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
240 float data[6];
241 data[ASTCENC_SWZ_0] = 0.0f;
242 data[ASTCENC_SWZ_1] = 1.0f;
243
244 for (int z = zd_start; z < padsize_z; z++)
245 {
246 int z_src = (z - zd_start) + offset_z - kernel_radius_z;
247 z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
248 float* data32 = static_cast<float*>(img->data[z_src]);
249
250 for (int y = 1; y < padsize_y; y++)
251 {
252 int y_src = (y - 1) + offset_y - kernel_radius_xy;
253 y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
254
255 for (int x = 1; x < padsize_x; x++)
256 {
257 int x_src = (x - 1) + offset_x - kernel_radius_xy;
258 x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
259
260 data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];
261 data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
262 data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
263 data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
264
265 float r = data[swz.r];
266 float g = data[swz.g];
267 float b = data[swz.b];
268 float a = data[swz.a];
269
270 vfloat4 d(r, g, b, a);
271
272 VARBUF1(z, y, x) = d;
273 VARBUF2(z, y, x) = d * d;
274 }
275 }
276 }
277 }
278
279 // Pad with an extra layer of 0s; this forms the edge of the SAT tables
280 vfloat4 vbz = vfloat4::zero();
281 for (int z = 0; z < padsize_z; z++)
282 {
283 for (int y = 0; y < padsize_y; y++)
284 {
285 VARBUF1(z, y, 0) = vbz;
286 VARBUF2(z, y, 0) = vbz;
287 }
288
289 for (int x = 0; x < padsize_x; x++)
290 {
291 VARBUF1(z, 0, x) = vbz;
292 VARBUF2(z, 0, x) = vbz;
293 }
294 }
295
296 if (have_z)
297 {
298 for (int y = 0; y < padsize_y; y++)
299 {
300 for (int x = 0; x < padsize_x; x++)
301 {
302 VARBUF1(0, y, x) = vbz;
303 VARBUF2(0, y, x) = vbz;
304 }
305 }
306 }
307
308 // Generate summed-area tables for N and N^2; this is done in-place, using
309 // a Brent-Kung parallel-prefix based algorithm to minimize precision loss
310 for (int z = zd_start; z < padsize_z; z++)
311 {
312 for (int y = 1; y < padsize_y; y++)
313 {
314 brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
315 brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
316 }
317 }
318
319 for (int z = zd_start; z < padsize_z; z++)
320 {
321 for (int x = 1; x < padsize_x; x++)
322 {
323 brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
324 brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
325 }
326 }
327
328 if (have_z)
329 {
330 for (int y = 1; y < padsize_y; y++)
331 {
332 for (int x = 1; x < padsize_x; x++)
333 {
334 brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
335 brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
336 }
337 }
338 }
339
340 // Compute a few constants used in the variance-calculation.
341 float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
342 float alpha_rsamples;
343
344 if (have_z)
345 {
346 alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
347 }
348 else
349 {
350 alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
351 }
352
353 // Use the summed-area tables to compute variance for each neighborhood
354 if (have_z)
355 {
356 for (int z = 0; z < size_z; z++)
357 {
358 int z_src = z + kernel_radius_z;
359 int z_dst = z + offset_z;
360 int z_low = z_src - alpha_kernel_radius;
361 int z_high = z_src + alpha_kernel_radius + 1;
362
363 for (int y = 0; y < size_y; y++)
364 {
365 int y_src = y + kernel_radius_xy;
366 int y_dst = y + offset_y;
367 int y_low = y_src - alpha_kernel_radius;
368 int y_high = y_src + alpha_kernel_radius + 1;
369
370 for (int x = 0; x < size_x; x++)
371 {
372 int x_src = x + kernel_radius_xy;
373 int x_dst = x + offset_x;
374 int x_low = x_src - alpha_kernel_radius;
375 int x_high = x_src + alpha_kernel_radius + 1;
376
377 // Summed-area table lookups for alpha average
378 float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
379 - VARBUF1(z_high, y_low, x_high).lane<3>()
380 - VARBUF1(z_high, y_high, x_low).lane<3>()
381 + VARBUF1(z_high, y_high, x_high).lane<3>()) -
382 ( VARBUF1(z_low, y_low, x_low).lane<3>()
383 - VARBUF1(z_low, y_low, x_high).lane<3>()
384 - VARBUF1(z_low, y_high, x_low).lane<3>()
385 + VARBUF1(z_low, y_high, x_high).lane<3>());
386
387 int out_index = z_dst * zdt + y_dst * ydt + x_dst;
388 input_alpha_averages[out_index] = (vasum * alpha_rsamples);
389 }
390 }
391 }
392 }
393 else
394 {
395 for (int y = 0; y < size_y; y++)
396 {
397 int y_src = y + kernel_radius_xy;
398 int y_dst = y + offset_y;
399 int y_low = y_src - alpha_kernel_radius;
400 int y_high = y_src + alpha_kernel_radius + 1;
401
402 for (int x = 0; x < size_x; x++)
403 {
404 int x_src = x + kernel_radius_xy;
405 int x_dst = x + offset_x;
406 int x_low = x_src - alpha_kernel_radius;
407 int x_high = x_src + alpha_kernel_radius + 1;
408
409 // Summed-area table lookups for alpha average
410 float vasum = VARBUF1(0, y_low, x_low).lane<3>()
411 - VARBUF1(0, y_low, x_high).lane<3>()
412 - VARBUF1(0, y_high, x_low).lane<3>()
413 + VARBUF1(0, y_high, x_high).lane<3>();
414
415 int out_index = y_dst * ydt + x_dst;
416 input_alpha_averages[out_index] = (vasum * alpha_rsamples);
417 }
418 }
419 }
420 }
421
422 /* See header for documentation. */
init_compute_averages(const astcenc_image & img,unsigned int alpha_kernel_radius,const astcenc_swizzle & swz,avg_args & ag)423 unsigned int init_compute_averages(
424 const astcenc_image& img,
425 unsigned int alpha_kernel_radius,
426 const astcenc_swizzle& swz,
427 avg_args& ag
428 ) {
429 unsigned int size_x = img.dim_x;
430 unsigned int size_y = img.dim_y;
431 unsigned int size_z = img.dim_z;
432
433 // Compute maximum block size and from that the working memory buffer size
434 unsigned int kernel_radius = alpha_kernel_radius;
435 unsigned int kerneldim = 2 * kernel_radius + 1;
436
437 bool have_z = (size_z > 1);
438 unsigned int max_blk_size_xy = have_z ? 16 : 32;
439 unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);
440
441 unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
442 unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
443
444 // Perform block-wise averages calculations across the image
445 // Initialize fields which are not populated until later
446 ag.arg.size_x = 0;
447 ag.arg.size_y = 0;
448 ag.arg.size_z = 0;
449 ag.arg.offset_x = 0;
450 ag.arg.offset_y = 0;
451 ag.arg.offset_z = 0;
452 ag.arg.work_memory = nullptr;
453
454 ag.arg.img = &img;
455 ag.arg.swz = swz;
456 ag.arg.have_z = have_z;
457 ag.arg.alpha_kernel_radius = alpha_kernel_radius;
458
459 ag.img_size_x = size_x;
460 ag.img_size_y = size_y;
461 ag.img_size_z = size_z;
462 ag.blk_size_xy = max_blk_size_xy;
463 ag.blk_size_z = max_blk_size_z;
464 ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
465
466 // The parallel task count
467 unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
468 unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
469 return z_tasks * y_tasks;
470 }
471
472 #endif
473