1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
16 
17 
// Fills the indirection buffer of a 2D convolution operator: for every
// (output pixel, kernel tap) pair, stores a pointer to the input pixel that
// the tap reads, or a pointer to the operator's zero buffer when the tap
// lands in the implicit padding region.
//
// The buffer is laid out in tiles of output_tile_size output pixels; entries
// past the last real output pixel replicate the final pixel so that a full
// tile is always addressable.
void xnn_indirection_init_conv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const void* zero = op->zero_buffer;
  // Stride between adjacent input pixels, converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);

  for (size_t tile_start = 0; tile_start < tiled_output_size; tile_start += output_tile_size) {
    for (size_t tile_offset = 0; tile_offset < output_tile_size; tile_offset++) {
      // Clamp so that the tail of the last (partial) tile replicates the final pixel.
      const size_t output_index = min(tile_start + tile_offset, output_size - 1);
      const struct fxdiv_result_size_t y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_y = y_x.quotient;
      const size_t output_x = y_x.remainder;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
        // Unsigned wrap-around makes taps above the top edge compare >= input_height.
        const size_t input_y = output_y * stride_height + kernel_y * dilation_height - padding_top;
        for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
          const size_t input_x = output_x * stride_width + kernel_x * dilation_width - padding_left;
          const size_t kernel_index = kernel_y * kernel_width + kernel_x;
          const size_t index = tile_start * kernel_size + kernel_index * output_tile_size + tile_offset;
          indirection_buffer[index] = (input_y < input_height && input_x < input_width)
            ? (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride)
            : zero;
        }
      }
    }
  }
}
77 
// Fills the indirection buffer of a 2D deconvolution (transposed convolution)
// operator. For every (output pixel, kernel tap) pair the buffer receives a
// pointer to the input pixel that contributes through that tap, or a pointer
// to the operator's zero buffer when no input pixel maps onto it (the
// coordinate is not divisible by the stride, or it is out of bounds).
void xnn_indirection_init_deconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const void* zero = op->zero_buffer;
  // Stride between adjacent input pixels, converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
  const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
  const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);

  for (size_t tile_start = 0; tile_start < tiled_output_size; tile_start += output_tile_size) {
    for (size_t tile_offset = 0; tile_offset < output_tile_size; tile_offset++) {
      // Clamp so that the tail of the last (partial) tile replicates the final pixel.
      const size_t output_index = min(tile_start + tile_offset, output_size - 1);
      const struct fxdiv_result_size_t y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_y = y_x.quotient;
      const size_t output_x = y_x.remainder;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
        // Unstrided input row; wraps to a huge value when the tap is above the input.
        const size_t numerator_y = output_y + padding_top - kernel_y * dilation_height;
        const size_t input_y = fxdiv_quotient_size_t(numerator_y, stride_height_divisor);
        // The row contributes only when it divides evenly by the stride and is in bounds.
        // This is invariant over kernel_x, so hoist it out of the inner loop.
        const int row_valid = input_y * stride_height == numerator_y && input_y < input_height;
        for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
          const size_t numerator_x = output_x + padding_left - kernel_x * dilation_width;
          const size_t input_x = fxdiv_quotient_size_t(numerator_x, stride_width_divisor);
          const size_t kernel_index = kernel_y * kernel_width + kernel_x;
          const size_t index = tile_start * kernel_size + kernel_index * output_tile_size + tile_offset;
          indirection_buffer[index] =
            (row_valid && input_x * stride_width == numerator_x && input_x < input_width)
              ? (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride)
              : zero;
        }
      }
    }
  }
}
132 
// Initializes the indirection buffer and subconvolution parameter table for a
// 2D deconvolution implemented as subconvolutions: one independent convolution
// per (offset_y, offset_x) phase of the stride, each producing the subset of
// output pixels congruent to that phase.
//
// Pointers are appended to the indirection buffer sequentially; each
// subconvolution_params entry records where its slice of the buffer begins
// and the stride between its output rows.
//
// NOTE(review): subconvolution_params->indirection_x_stride is only read
// here, never written -- it is presumably initialized by the operator setup
// code before this function runs; confirm against the caller.
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer                     = op->indirection_buffer;
  struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
  const void* input                                   = op->input;
  // Stride between adjacent input pixels, converted from elements to bytes.
  const size_t input_pixel_stride                     = op->input_pixel_stride << log2_element_size;
  const void* zero                                    = op->zero_buffer;
  const size_t input_height                           = op->input_height;
  const size_t input_width                            = op->input_width;
  const size_t output_height                          = op->output_height;
  const size_t output_width                           = op->output_width;
  const size_t kernel_height                          = op->kernel_height;
  const size_t kernel_width                           = op->kernel_width;
  const size_t stride_height                          = op->stride_height;
  const size_t stride_width                           = op->stride_width;
  const size_t padding_top                            = op->padding_top;
  const size_t padding_left                           = op->padding_left;

  // Phase of the padding relative to the stride; used to map each stride
  // offset to the first output row/column that belongs to it.
  const size_t modulo_padding_top = padding_top % stride_height;
  const size_t modulo_padding_left = padding_left % stride_width;
  for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
    const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
    for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
      const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
      // Number of output columns produced by this subconvolution.
      const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);

      // Record where this subconvolution's pointers start and how far apart
      // its output rows are (one x-stride per tile-rounded output column).
      subconvolution_params->indirection_buffer = indirection_buffer;
      subconvolution_params->indirection_y_stride =
        subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
      ++subconvolution_params;

      for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
        for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
          // Only kernel rows congruent to offset_y (mod stride) can hit input
          // pixels for output rows of this phase.
          for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
            assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
            const size_t y = output_y + padding_top - kernel_y;
            const size_t input_y = y / stride_height;

            for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
              for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
                // Clamp so that the tail of a partial tile replicates the last column.
                const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
                const size_t output_x = output_x_start + sliced_output_x * stride_width;

                assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
                const size_t x = output_x + padding_left - kernel_x;
                const size_t input_x = x / stride_width;

                // Unsigned wrap-around makes out-of-range (negative)
                // coordinates compare >= the input extent, selecting zero.
                if (input_y < input_height && input_x < input_width) {
                  *indirection_buffer++ =
                    (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
                } else {
                  *indirection_buffer++ = zero;
                }
              }
            }
          }
        }
      }
    }
  }
}
197 
// Fills the indirection buffer of a 2D depthwise convolution operator.
// Entry layout:
//   output_y * step_height + output_x * step_width * kernel_height
//     + kernel_x * kernel_height + kernel_y
// i.e. the taps of one output pixel are stored x-major, then y. Taps that
// fall into the implicit padding point at the operator's zero buffer.
void xnn_indirection_init_dwconv2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const void* zero = op->zero_buffer;
  // Stride between adjacent input pixels, converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  for (size_t output_y = 0; output_y < output_height; output_y++) {
    for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
      // Unsigned wrap-around makes taps above the top edge compare >= input_height.
      const size_t input_y = output_y * stride_height + kernel_y * dilation_height - padding_top;
      // Part of the entry index that is constant across the two inner loops.
      const size_t row_offset = output_y * step_height + kernel_y;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
          const size_t input_x = output_x * stride_width + kernel_x * dilation_width - padding_left;
          const size_t index = row_offset + (output_x * step_width + kernel_x) * kernel_height;
          indirection_buffer[index] = (input_y < input_height && input_x < input_width)
            ? (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride)
            : zero;
        }
      }
    }
  }
}
248 
// Initializes the indirection buffer of a 2D max-pooling operator: one
// pointer per (output pixel, pooling element). Within one output pixel the
// entries are ordered as pooling_x * pooling_height + pooling_y; output rows
// are step_height entries apart and adjacent output pixels are
// step_width * pooling_height entries apart.
//
// Max pooling needs no zero buffer: taps that would fall outside the input
// are redirected to an in-bounds pixel, and feeding a duplicate pixel into a
// maximum does not change the result.
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input               = op->input;
  // Stride between adjacent input pixels, converted from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height       = op->input_height;
  const size_t input_width        = op->input_width;
  const size_t output_height      = op->output_height;
  const size_t output_width       = op->output_width;
  const size_t pooling_height     = op->kernel_height;
  const size_t pooling_width      = op->kernel_width;
  const size_t stride_height      = op->stride_height;
  const size_t stride_width       = op->stride_width;
  const size_t dilation_height    = op->dilation_height;
  const size_t dilation_width     = op->dilation_width;
  const size_t input_padding_top  = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamp to the border doesn't work for pooling with dilation.
    // Instead, out-of-bounds taps fall back to a substitute index that keeps
    // the same phase modulo the dilation, hence the padding is reduced
    // modulo the dilation first.
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // Substitute row with the same dilation phase as the real taps;
        // bumped by one dilation step if the subtraction would underflow.
        size_t safe_input_y = output_y * stride_height;
        if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
          safe_input_y += dilation_height;
        }
        safe_input_y -= adjusted_padding_top;

        // Unsigned wrap-around makes rows above the input compare >= input_height.
        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            // Same substitution scheme for the horizontal coordinate.
            size_t safe_input_x = output_x * stride_width;
            if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
              safe_input_x += dilation_width;
            }
            safe_input_x -= adjusted_padding_left;

            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    // Without dilation, out-of-bounds taps are simply clamped to the nearest
    // border pixel: doz() saturates the subtraction at zero (top/left edge)
    // and min() clamps at the far edge (bottom/right).
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}
326 
// Builds per-output-pixel indirection pointers and interpolation weights for
// bilinear resizing of HWC-layout tensors. For every output pixel, four
// pointers (top-left, top-right, bottom-left, bottom-right input pixel) are
// appended to indirection_buffer and the fractional weights
// (alpha_x, alpha_y) are appended to packed_weights.
//
// NOTE(review): unlike the operator-based initializers in this file,
// input_pixel_stride is used unscaled (no log2_element_size shift), so the
// caller presumably passes it already in bytes -- confirm at the call site.
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // Dimensions must fit in 2**24 so the integer coordinates below survive
  // the round-trips through float without losing precision.
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners, the first and last output samples map exactly onto
  // the input corners, so the scale spans (size - 1) intervals.
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Legacy / align-corners convention: input coordinate is
    // output * scale with no half-pixel offset (never negative, so no
    // clamping at zero is needed).
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      // Clamp the second sample row to the bottom edge.
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        // Clamp the second sample column to the right edge.
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel-centers convention: sample at (i + 0.5) * scale - 0.5,
    // clamped to the valid coordinate range.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}
421 
// Builds indirection pointers and packed interpolation weights for bilinear
// resizing of CHW-layout tensors. Per output pixel, two pointers are appended
// to indirection_buffer (top and bottom input rows at the left sample column)
// and (alpha_x, alpha_y) are appended to packed_weights. The right-hand
// neighbor is addressed by the consuming kernel as "left pixel + 1", which is
// why the left column steps back one pixel at the right edge instead of
// clamping.
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // CHW kernels need a pixel both below and to the right, so each input
  // dimension must be at least 2; the 2**24 bound keeps integer coordinates
  // exact in float.
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners, the first and last samples map exactly onto the input
  // corners, so the scale spans (size - 1) intervals.
  const int32_t w_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t h_adjustment = (int32_t) (align_corners && output_height != 1);
  const float w_scale =
    (float) ((int32_t) input_width - w_adjustment) / (float) ((int32_t) output_width - w_adjustment);
  const float h_scale =
    (float) ((int32_t) input_height - h_adjustment) / (float) ((int32_t) output_height - h_adjustment);

  const uint32_t y_max = (uint32_t) input_height - 1;
  const uint32_t x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Legacy / align-corners convention: coordinate = output * scale, never
    // negative, so no clamping at zero is needed.
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float fy = (float) (int32_t) output_y * h_scale;
      assert(fy >= 0.0f);
      assert(fy < (float) input_height);

      const uint32_t y_top = (uint32_t) (int32_t) fy;
      const uint32_t y_bottom = math_min_u32(y_top + 1, y_max);
      const float alpha_y = fy - (float) y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float fx = (float) (int32_t) output_x * w_scale;
        assert(fx >= 0.0f);
        assert(fx < (float) input_width);

        uint32_t x_left = (uint32_t) (int32_t) fx;
        float alpha_x = fx - (float) x_left;
        if (x_left == x_max) {
          // Step one pixel left and interpolate fully to the right so the
          // kernel's implicit "left + 1" access stays in bounds.
          --x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel-centers convention: sample at (i + 0.5) * scale - 0.5,
    // clamped to the valid coordinate range.
    const float h_offset = 0.5f * h_scale - 0.5f;
    const float w_offset = 0.5f * w_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float fy = (float) (int32_t) output_y * h_scale + h_offset;
      fy = math_min_f32(math_max_f32(fy, 0.0f), (float) y_max);
      const uint32_t y_top = (uint32_t) (int32_t) fy;
      assert((int32_t) y_top >= 0);
      const uint32_t y_bottom = math_min_u32(y_top + 1, y_max);
      const float alpha_y = fy - (float) y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float fx = (float) (int32_t) output_x * w_scale + w_offset;
        fx = math_min_f32(math_max_f32(fx, 0.0f), (float) x_max);
        uint32_t x_left = (uint32_t) (int32_t) fx;
        assert((int32_t) x_left >= 0);

        float alpha_x = fx - (float) x_left;
        if (x_left == x_max) {
          // Step one pixel left and interpolate fully to the right so the
          // kernel's implicit "left + 1" access stays in bounds.
          --x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  }
}
521 
// Fills the indirection buffer of a 2D max-unpooling operator, starting at
// batch image batch_start. Each entry maps one (input pixel, pooling element)
// pair to the output pixel it scatters into; output coordinates that fall
// into the padding are clamped to the nearest edge pixel.
void xnn_indirection_init_unpool2d(
  xnn_operator_t op,
  size_t batch_start,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* output = op->output;
  // Stride between adjacent output pixels, converted from elements to bytes.
  const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
  const size_t batch_size = op->batch_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  for (size_t image = batch_start; image < batch_size; image++) {
    for (size_t input_y = 0; input_y < input_height; input_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // doz() saturates the subtraction at zero; min() clamps at the far edge.
        const size_t output_y = min(doz(input_y * pooling_height + pooling_y, padding_top), output_height - 1);
        for (size_t input_x = 0; input_x < input_width; input_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t output_x = min(doz(input_x * pooling_width + pooling_x, padding_left), output_width - 1);
            const size_t input_pixel = (image * input_height + input_y) * input_width + input_x;
            const size_t entry = (input_pixel * pooling_width + pooling_x) * pooling_height + pooling_y;
            const size_t output_pixel = (image * output_height + output_y) * output_width + output_x;
            indirection_buffer[entry] =
              (const void*) ((uintptr_t) output + output_pixel * output_pixel_stride);
          }
        }
      }
    }
  }
}
555