// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and it also significantly improves
 * numerical stability, allowing us to use floats rather than doubles.
 */
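//
// As a reminder of why summed-area tables give an O(1) footprint: once each
// entry S(y, x) stores the sum of all values with coordinates <= (y, x), the
// sum over any axis-aligned box needs only four lookups:
//
//     sum(y0 < y <= y1, x0 < x <= x1) =
//         S(y1, x1) - S(y1, x0) - S(y0, x1) + S(y0, x0)
//
// The 3D case used for volume textures needs eight lookups, with the sign of
// each corner set by how many of its coordinates are on the "low" side.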

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d      The array to prefix-sum.
 * @param items  The number of items in the array.
 * @param stride The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
	vfloat4* d,
	size_t items,
	int stride
) {
	if (items < 2)
		return;

	size_t lc_stride = 2;
	size_t log2_stride = 1;

	// The reduction-tree loop
	do {
		size_t step = lc_stride >> 1;
		size_t start = lc_stride - 1;
		size_t iters = items >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}

		log2_stride += 1;
		lc_stride <<= 1;
	} while (lc_stride <= items);

	// The expansion-tree loop
	do {
		log2_stride -= 1;
		lc_stride >>= 1;

		size_t step = lc_stride >> 1;
		size_t start = step + lc_stride - 1;
		size_t iters = (items - step) >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}
	} while (lc_stride > 2);
}
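
// To make the control flow above concrete, a small worked example (a sketch,
// not part of the shipping code): for items = 4 and stride = 1 on an input
// [a, b, c, d] the reduction-tree loop produces
//     pass 1: [a, a+b, c, c+d]
//     pass 2: [a, a+b, c, a+b+c+d]
// and the expansion-tree loop then fills in the remaining partial sum:
//             [a, a+b, a+b+c, a+b+c+d]
// Each pass combines only independent pairs, which is what removes the serial
// dependency chain and keeps the float accumulation error low.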

/* See header for documentation. */
void compute_pixel_region_variance(
	astcenc_contexti& ctx,
	const pixel_region_args& arg
) {
	// Unpack the memory structure into local variables
	const astcenc_image* img = arg.img;
	astcenc_swizzle swz = arg.swz;
	bool have_z = arg.have_z;

	int size_x = arg.size_x;
	int size_y = arg.size_y;
	int size_z = arg.size_z;

	int offset_x = arg.offset_x;
	int offset_y = arg.offset_y;
	int offset_z = arg.offset_z;

	int alpha_kernel_radius = arg.alpha_kernel_radius;

	float*   input_alpha_averages = ctx.input_alpha_averages;
	vfloat4* work_memory = arg.work_memory;

	// Compute memory sizes and dimensions that we need
	int kernel_radius = alpha_kernel_radius;
	int kerneldim = 2 * kernel_radius + 1;
	int kernel_radius_xy = kernel_radius;
	int kernel_radius_z = have_z ? kernel_radius : 0;

	int padsize_x = size_x + kerneldim;
	int padsize_y = size_y + kerneldim;
	int padsize_z = size_z + (have_z ? kerneldim : 0);
	int sizeprod = padsize_x * padsize_y * padsize_z;

	int zd_start = have_z ? 1 : 0;

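	// The first half of work_memory accumulates per-texel values and the second
	// half the squared values; both halves are later turned in-place into
	// summed-area tables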
	vfloat4 *varbuf1 = work_memory;
	vfloat4 *varbuf2 = work_memory + sizeprod;

	// Scaling factors to apply to Y and Z for accesses into the work buffers
	int yst = padsize_x;
	int zst = padsize_x * padsize_y;

	// Scaling factors to apply to Y and Z for accesses into result buffers
	int ydt = img->dim_x;
	int zdt = img->dim_x * img->dim_y;

	// Macros to act as accessor functions for the work-memory
	#define VARBUF1(z, y, x) varbuf1[(z) * zst + (y) * yst + (x)]
	#define VARBUF2(z, y, x) varbuf2[(z) * zst + (y) * yst + (x)]

	// Load N and N^2 values into the work buffers
	if (img->data_type == ASTCENC_TYPE_U8)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE
		uint8_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 255;
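
		// Only data[0..3] are rewritten per texel in the loop below; the swizzle
		// selectors ASTCENC_SWZ_0 (4) and ASTCENC_SWZ_1 (5) index the two constant
		// entries set up above, so constant channels need no per-texel branching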

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					uint8_t r = data[swz.r];
					uint8_t g = data[swz.g];
					uint8_t b = data[swz.b];
					uint8_t a = data[swz.a];

					vfloat4 d = vfloat4(r * (1.0f / 255.0f),
					                    g * (1.0f / 255.0f),
					                    b * (1.0f / 255.0f),
					                    a * (1.0f / 255.0f));

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else if (img->data_type == ASTCENC_TYPE_F16)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
		uint16_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 0x3C00;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
					vfloat4 d = float16_to_float(di);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else // if (img->data_type == ASTCENC_TYPE_F32)
	{
		assert(img->data_type == ASTCENC_TYPE_F32);

		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
		float data[6];
		data[ASTCENC_SWZ_0] = 0.0f;
		data[ASTCENC_SWZ_1] = 1.0f;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			float* data32 = static_cast<float*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					float r = data[swz.r];
					float g = data[swz.g];
					float b = data[swz.b];
					float a = data[swz.a];

					vfloat4 d(r, g, b, a);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}

	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
	vfloat4 vbz = vfloat4::zero();
	for (int z = 0; z < padsize_z; z++)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			VARBUF1(z, y, 0) = vbz;
			VARBUF2(z, y, 0) = vbz;
		}

		for (int x = 0; x < padsize_x; x++)
		{
			VARBUF1(z, 0, x) = vbz;
			VARBUF2(z, 0, x) = vbz;
		}
	}

	if (have_z)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			for (int x = 0; x < padsize_x; x++)
			{
				VARBUF1(0, y, x) = vbz;
				VARBUF2(0, y, x) = vbz;
			}
		}
	}
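
	// The zero border means that any SAT lookup which lands on the low edge of
	// the padded region reads a zero partial sum, so the box-sum arithmetic
	// below needs no special casing at the boundaries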

	// Generate summed-area tables for N and N^2; this is done in-place, using
	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
		}
	}

	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int x = 1; x < padsize_x; x++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
		}
	}

	if (have_z)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			for (int x = 1; x < padsize_x; x++)
			{
				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
			}
		}
	}
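
	// After the separable passes above, VARBUF1(z, y, x) holds the inclusive sum
	// of all loaded values with coordinates <= (z, y, x), and VARBUF2 holds the
	// same for the squared values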

	// Compute a few constants used in the alpha-averaging calculation
	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
	float alpha_rsamples;

	if (have_z)
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
	}
	else
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
	}
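
	// For example, with alpha_kernel_radius = 2 the kernel covers 5 texels per
	// axis, so alpha_rsamples is 1/25 for 2D images and 1/125 for 3D images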

	// Use the summed-area tables to compute the alpha average for each neighborhood
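	// Each neighborhood sum is reconstructed from the SAT with the usual
	// inclusion-exclusion pattern: four corner lookups in 2D, eight in 3D, with
	// the sign of each corner set by how many of its coordinates are "low".
	// Only the alpha lane (lane 3) of the sums is consumed here.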
	if (have_z)
	{
		for (int z = 0; z < size_z; z++)
		{
			int z_src = z + kernel_radius_z;
			int z_dst = z + offset_z;
			int z_low  = z_src - alpha_kernel_radius;
			int z_high = z_src + alpha_kernel_radius + 1;

			for (int y = 0; y < size_y; y++)
			{
				int y_src = y + kernel_radius_xy;
				int y_dst = y + offset_y;
				int y_low  = y_src - alpha_kernel_radius;
				int y_high = y_src + alpha_kernel_radius + 1;

				for (int x = 0; x < size_x; x++)
				{
					int x_src = x + kernel_radius_xy;
					int x_dst = x + offset_x;
					int x_low  = x_src - alpha_kernel_radius;
					int x_high = x_src + alpha_kernel_radius + 1;

					// Summed-area table lookups for alpha average
					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
					               - VARBUF1(z_high, y_high, x_low).lane<3>()
					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
					               + VARBUF1(z_low,  y_high, x_high).lane<3>());

					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
				}
			}
		}
	}
	else
	{
		for (int y = 0; y < size_y; y++)
		{
			int y_src = y + kernel_radius_xy;
			int y_dst = y + offset_y;
			int y_low  = y_src - alpha_kernel_radius;
			int y_high = y_src + alpha_kernel_radius + 1;

			for (int x = 0; x < size_x; x++)
			{
				int x_src = x + kernel_radius_xy;
				int x_dst = x + offset_x;
				int x_low  = x_src - alpha_kernel_radius;
				int x_high = x_src + alpha_kernel_radius + 1;

				// Summed-area table lookups for alpha average
				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
				            - VARBUF1(0, y_low,  x_high).lane<3>()
				            - VARBUF1(0, y_high, x_low).lane<3>()
				            + VARBUF1(0, y_high, x_high).lane<3>();

				int out_index = y_dst * ydt + x_dst;
				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
			}
		}
	}
}

/* See header for documentation. */
unsigned int init_compute_averages(
	const astcenc_image& img,
	unsigned int alpha_kernel_radius,
	const astcenc_swizzle& swz,
	avg_args& ag
) {
	unsigned int size_x = img.dim_x;
	unsigned int size_y = img.dim_y;
	unsigned int size_z = img.dim_z;

	// Compute maximum block size and from that the working memory buffer size
	unsigned int kernel_radius = alpha_kernel_radius;
	unsigned int kerneldim = 2 * kernel_radius + 1;

	bool have_z = (size_z > 1);
	unsigned int max_blk_size_xy = have_z ? 16 : 32;
	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

	// Perform block-wise average calculations across the image
	// Initialize fields which are not populated until later
	ag.arg.size_x = 0;
	ag.arg.size_y = 0;
	ag.arg.size_z = 0;
	ag.arg.offset_x = 0;
	ag.arg.offset_y = 0;
	ag.arg.offset_z = 0;
	ag.arg.work_memory = nullptr;

	ag.arg.img = &img;
	ag.arg.swz = swz;
	ag.arg.have_z = have_z;
	ag.arg.alpha_kernel_radius = alpha_kernel_radius;

	ag.img_size_x = size_x;
	ag.img_size_y = size_y;
	ag.img_size_z = size_z;
	ag.blk_size_xy = max_blk_size_xy;
	ag.blk_size_z = max_blk_size_z;
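	// Two SATs are stored per padded block (values and squared values), hence
	// the factor of two in the work memory sizing below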
	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

	// The parallel task count
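	// As a sketch: a 256x256 2D image with max_blk_size_xy = 32 produces
	// y_tasks = 8 and z_tasks = 1, so 8 independent tasks in total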
	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
	return z_tasks * y_tasks;
}

#endif