1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
16
17
// Fills the indirection buffer for a 2D convolution operator.
//
// For every (output pixel, kernel tap) pair the buffer receives a pointer to
// the input pixel read by that tap, or to the operator's zero buffer when the
// tap lands in the implicit padding. Entries are grouped in tiles of
// output_tile_size output pixels: within a tile, all pointers for one kernel
// tap are stored contiguously, as the IGEMM micro-kernels expect.
void xnn_indirection_init_conv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const void* zero = op->zero_buffer;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  // The trailing partial tile is padded with duplicates of the last pixel.
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);

  for (size_t tile_start = 0; tile_start < tiled_output_size; tile_start += output_tile_size) {
    for (size_t tile_offset = 0; tile_offset < output_tile_size; tile_offset++) {
      // Positions past the end of the output alias the final output pixel.
      const size_t output_index = min(tile_start + tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_y = output_y_x.quotient;
      const size_t output_x = output_y_x.remainder;
      const size_t tile_index_base = tile_start * kernel_size + tile_offset;
      for (size_t ky = 0; ky < kernel_height; ky++) {
        // Unsigned arithmetic on purpose: taps above the top padding wrap
        // around to a huge value and fail the bounds test below.
        const size_t input_y = output_y * stride_height + ky * dilation_height - padding_top;
        const bool row_in_bounds = input_y < input_height;
        for (size_t kx = 0; kx < kernel_width; kx++) {
          const size_t input_x = output_x * stride_width + kx * dilation_width - padding_left;
          const size_t index = tile_index_base + (ky * kernel_width + kx) * output_tile_size;
          if (row_in_bounds && input_x < input_width) {
            indirection_buffer[index] =
              (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          } else {
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}
77
// Fills the indirection buffer for a 2D deconvolution (transposed
// convolution) operator.
//
// A kernel tap contributes to output pixel (output_y, output_x) only when the
// corresponding input coordinate is integral, i.e. when
// (output + padding - tap * dilation) is divisible by the stride; other taps
// point at the zero buffer. Entries are tiled exactly as in
// xnn_indirection_init_conv2d.
void xnn_indirection_init_deconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
  const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
  const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);

  for (size_t tile_start = 0; tile_start < tiled_output_size; tile_start += output_tile_size) {
    for (size_t tile_offset = 0; tile_offset < output_tile_size; tile_offset++) {
      // Positions past the end of the output alias the final output pixel.
      const size_t output_index = min(tile_start + tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_y = output_y_x.quotient;
      const size_t output_x = output_y_x.remainder;
      const size_t tile_index_base = tile_start * kernel_size + tile_offset;
      for (size_t ky = 0; ky < kernel_height; ky++) {
        // May wrap around (unsigned); the divisibility + bounds checks below
        // reject both wrapped and out-of-range rows.
        const size_t y = output_y + padding_top - ky * dilation_height;
        const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor);
        const bool row_valid = input_y * stride_height == y && input_y < input_height;
        for (size_t kx = 0; kx < kernel_width; kx++) {
          const size_t x = output_x + padding_left - kx * dilation_width;
          const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor);
          const size_t index = tile_index_base + (ky * kernel_width + kx) * output_tile_size;
          if (row_valid && input_x * stride_width == x && input_x < input_width) {
            indirection_buffer[index] =
              (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          } else {
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}
132
// Fills the indirection buffer and per-subconvolution parameters for a
// deconvolution implemented as stride_height * stride_width independent
// "subconvolutions", one per (offset_y, offset_x) phase of the stride.
//
// The indirection buffer is written sequentially through the advancing
// `indirection_buffer` cursor; each subconvolution records the cursor
// position where its entries begin.
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
  const void* input = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t modulo_padding_top = padding_top % stride_height;
  const size_t modulo_padding_left = padding_left % stride_width;
  for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
    // First output row belonging to this vertical phase.
    const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
    for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
      // First output column belonging to this horizontal phase, and the
      // number of output columns this subconvolution produces.
      const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
      const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);

      // Record where this subconvolution's entries start in the buffer.
      // NOTE(review): indirection_x_stride is read here but never written in
      // this function — it is presumably initialized by the caller; verify.
      subconvolution_params->indirection_buffer = indirection_buffer;
      subconvolution_params->indirection_y_stride =
        subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
      ++subconvolution_params;

      for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
        for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
          // Only kernel rows with the matching stride phase contribute.
          for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
            assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
            // Unsigned wraparound for taps above the padding is caught by the
            // input_y < input_height test below.
            const size_t y = output_y + padding_top - kernel_y;
            const size_t input_y = y / stride_height;

            for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
              for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
                // The trailing partial tile duplicates the last sliced column.
                const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
                const size_t output_x = output_x_start + sliced_output_x * stride_width;

                assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
                const size_t x = output_x + padding_left - kernel_x;
                const size_t input_x = x / stride_width;

                if (input_y < input_height && input_x < input_width) {
                  *indirection_buffer++ =
                    (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
                } else {
                  *indirection_buffer++ = zero;
                }
              }
            }
          }
        }
      }
    }
  }
}
197
// Fills the indirection buffer for a 2D depthwise convolution operator.
//
// Layout: entries for one output row are step_height apart; within a row,
// each output pixel owns step_width * kernel_height slots, ordered by kernel
// column and then kernel row. Padding taps point at the zero buffer.
void xnn_indirection_init_dwconv2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  for (size_t output_y = 0; output_y < output_height; output_y++) {
    for (size_t ky = 0; ky < kernel_height; ky++) {
      // Unsigned arithmetic on purpose: taps above the top padding wrap
      // around to a huge value and fail the bounds test below.
      const size_t input_y = output_y * stride_height + ky * dilation_height - padding_top;
      const bool row_in_bounds = input_y < input_height;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        for (size_t kx = 0; kx < kernel_width; kx++) {
          const size_t input_x = output_x * stride_width + kx * dilation_width - padding_left;
          const size_t index =
            output_y * step_height + (output_x * step_width + kx) * kernel_height + ky;
          if (row_in_bounds && input_x < input_width) {
            indirection_buffer[index] =
              (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          } else {
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}
248
// Fills the indirection buffer for a 2D max-pooling operator.
//
// Unlike convolution there is no zero buffer here: every entry points at a
// real input pixel. Out-of-range taps are redirected to an in-bounds pixel,
// which merely duplicates a value already covered by the window and so leaves
// the maximum unchanged.
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  // Per-pixel stride converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamp to the border doesn't work for pooling with dilation: the
    // substitute for an out-of-range tap must stay on the same dilation
    // phase as the window's other taps, so a "safe" coordinate congruent to
    // the tap modulo the dilation is used instead.
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // Fallback row for this window, kept non-negative by bumping one
        // dilation step before subtracting the adjusted padding.
        // NOTE(review): assumes the result is always < input_height; confirm.
        size_t safe_input_y = output_y * stride_height;
        if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
          safe_input_y += dilation_height;
        }
        safe_input_y -= adjusted_padding_top;

        // Unsigned wraparound of taps above the padding also trips this test.
        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            // Same fallback construction for the horizontal coordinate.
            size_t safe_input_x = output_x * stride_width;
            if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
              safe_input_x += dilation_width;
            }
            safe_input_x -= adjusted_padding_left;

            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            // Entries for one output pixel are grouped by pooling column,
            // then pooling row.
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    // No dilation: simply clamp every tap to the nearest border pixel
    // (doz == difference-or-zero, i.e. saturating subtraction).
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}
326
// Fills the indirection buffer and packed weights for HWC bilinear resize.
//
// For every output pixel it stores four pointers (top-left, top-right,
// bottom-left, bottom-right source pixels) followed later by two weights
// (alpha_x, alpha_y) in packed_weights. Two coordinate mappings are
// supported: the legacy/align-corners mapping (output * scale) and the
// half-pixel-centers mapping used by TF2.
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners the corner pixels map exactly onto each other, so the
  // scale is taken over (size - 1) intervals instead of size pixels.
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t last_y = (uint32_t) input_height - 1;
  const uint32_t last_x = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Direct mapping: input coordinate = output coordinate * scale.
    for (size_t oy = 0; oy < output_height; oy++) {
      const float fy = (float) (int32_t) oy * height_scale;
      assert(fy >= 0.0f);
      assert(fy < (float) input_height);

      const uint32_t y_top = (uint32_t) (int32_t) fy;
      const uint32_t y_bottom = y_top + 1 < last_y ? y_top + 1 : last_y;
      const float alpha_y = fy - (float) y_top;
      for (size_t ox = 0; ox < output_width; ox++) {
        const float fx = (float) (int32_t) ox * width_scale;
        assert(fx >= 0.0f);
        assert(fx < (float) input_width);

        const uint32_t x_left = (uint32_t) (int32_t) fx;
        const uint32_t x_right = x_left + 1 < last_x ? x_left + 1 : last_x;
        const float alpha_x = fx - (float) x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel centers: sample at (output + 0.5) * scale - 0.5, clamped to
    // the valid coordinate range.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t oy = 0; oy < output_height; oy++) {
      float fy = (float) (int32_t) oy * height_scale + height_offset;
      if (fy < 0.0f) {
        fy = 0.0f;
      }
      if (fy > (float) last_y) {
        fy = (float) last_y;
      }
      const uint32_t y_top = (uint32_t) (int32_t) fy;
      assert((int32_t) y_top >= 0);
      const uint32_t y_bottom = y_top + 1 < last_y ? y_top + 1 : last_y;
      const float alpha_y = fy - (float) y_top;
      for (size_t ox = 0; ox < output_width; ox++) {
        float fx = (float) (int32_t) ox * width_scale + width_offset;
        if (fx < 0.0f) {
          fx = 0.0f;
        }
        if (fx > (float) last_x) {
          fx = (float) last_x;
        }
        const uint32_t x_left = (uint32_t) (int32_t) fx;
        assert((int32_t) x_left >= 0);
        const uint32_t x_right = x_left + 1 < last_x ? x_left + 1 : last_x;
        const float alpha_x = fx - (float) x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}
421
// Fills the indirection buffer and packed weights for CHW bilinear resize.
//
// Per output pixel it stores two pointers (top-left and bottom-left source
// pixels) plus two weights (alpha_x, alpha_y); the CHW micro-kernels read the
// right-hand neighbours themselves, so when the left column is the last input
// column the pointer is shifted one pixel left and alpha_x is set to 1.
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners the corner pixels map exactly onto each other, so the
  // scale is taken over (size - 1) intervals instead of size pixels.
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t last_y = (uint32_t) input_height - 1;
  const uint32_t last_x = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Direct mapping: input coordinate = output coordinate * scale.
    for (size_t oy = 0; oy < output_height; oy++) {
      const float fy = (float) (int32_t) oy * height_scale;
      assert(fy >= 0.0f);
      assert(fy < (float) input_height);

      const uint32_t y_top = (uint32_t) (int32_t) fy;
      const uint32_t y_bottom = y_top + 1 < last_y ? y_top + 1 : last_y;
      const float alpha_y = fy - (float) y_top;
      for (size_t ox = 0; ox < output_width; ox++) {
        const float fx = (float) (int32_t) ox * width_scale;
        assert(fx >= 0.0f);
        assert(fx < (float) input_width);

        uint32_t x_left = (uint32_t) (int32_t) fx;
        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel centers: sample at (output + 0.5) * scale - 0.5, clamped to
    // the valid coordinate range.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t oy = 0; oy < output_height; oy++) {
      float fy = (float) (int32_t) oy * height_scale + height_offset;
      if (fy < 0.0f) {
        fy = 0.0f;
      }
      if (fy > (float) last_y) {
        fy = (float) last_y;
      }
      const uint32_t y_top = (uint32_t) (int32_t) fy;
      assert((int32_t) y_top >= 0);
      const uint32_t y_bottom = y_top + 1 < last_y ? y_top + 1 : last_y;
      const float alpha_y = fy - (float) y_top;
      for (size_t ox = 0; ox < output_width; ox++) {
        float fx = (float) (int32_t) ox * width_scale + width_offset;
        if (fx < 0.0f) {
          fx = 0.0f;
        }
        if (fx > (float) last_x) {
          fx = (float) last_x;
        }
        uint32_t x_left = (uint32_t) (int32_t) fx;
        assert((int32_t) x_left >= 0);

        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --x_left;
          alpha_x = 1.0f;
        }

        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  }
}
521
// Fills the indirection buffer for a 2D max-unpooling operator, starting at
// batch image batch_start.
//
// Each entry maps an (input pixel, pooling tap) pair to the output pixel it
// scatters into; taps landing in the output padding are clamped to the
// nearest valid output pixel.
void xnn_indirection_init_unpool2d(
  xnn_operator_t op,
  size_t batch_start,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* output = op->output;
  // Per-pixel stride converted from elements to bytes.
  const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
  const size_t batch_size = op->batch_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  for (size_t image = batch_start; image < batch_size; image++) {
    for (size_t input_y = 0; input_y < input_height; input_y++) {
      for (size_t py = 0; py < pooling_height; py++) {
        // doz == difference-or-zero (saturating subtract), then clamp to the
        // last output row.
        const size_t output_y = min(doz(input_y * pooling_height + py, padding_top), output_height - 1);
        for (size_t input_x = 0; input_x < input_width; input_x++) {
          // Linear index of this input pixel across the whole batch.
          const size_t input_pixel = (image * input_height + input_y) * input_width + input_x;
          for (size_t px = 0; px < pooling_width; px++) {
            const size_t output_x = min(doz(input_x * pooling_width + px, padding_left), output_width - 1);
            indirection_buffer[(input_pixel * pooling_width + px) * pooling_height + py] =
              (const void*) ((uintptr_t) output +
                ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride);
          }
        }
      }
    }
  }
}
555