1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
26 
27 
// Computes the output extent along one spatial axis for a pooling window.
//
// A kernel of extent kernel_dimension dilated by dilation_dimension spans
// (kernel_dimension - 1) * dilation_dimension + 1 input elements; the result
// is the number of stride-spaced window positions that fit within the
// already-padded input extent. Caller guarantees the padded input is at least
// as large as the dilated kernel.
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  const size_t dilated_kernel_dimension = dilation_dimension * (kernel_dimension - 1) + 1;
  const size_t last_window_offset = padded_input_dimension - dilated_kernel_dimension;
  return last_window_offset / stride_dimension + 1;
}
37 
// Computes the output extent along one spatial axis under TensorFlow SAME
// padding: ceil(input / stride), independent of the kernel size (the padding
// is later chosen so that this relation holds).
static inline size_t compute_output_dimension_with_tf_same_padding(
    size_t input_dimension,
    size_t stride_dimension)
{
  // Ceiling division written in the overflow-free modulo form
  // (equivalent to divide_round_up(input_dimension, stride_dimension)).
  if (input_dimension % stride_dimension == 0) {
    return input_dimension / stride_dimension;
  }
  return input_dimension / stride_dimension + 1;
}
44 
// Shared creation logic for NHWC max-pooling operators (U8 and F32 variants).
//
// Validates all parameters, allocates the operator descriptor, and copies
// the datatype-specific clamping params (params/params_size) into it.
// On success, ownership of the new operator transfers to the caller via
// *max_pooling_op_out; on failure, any partially-created operator is freed
// and an error status is returned.
static enum xnn_status create_max_pooling2d_nhwc(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t pooling_height,
    uint32_t pooling_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    size_t channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint32_t flags,
    const void* params,
    size_t params_size,
    uint32_t datatype_init_flags,
    enum xnn_operator_type operator_type,
    xnn_operator_t* max_pooling_op_out)
{
  xnn_operator_t max_pooling_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    // Fixed: this is a create-time check, the message previously said "setup".
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    return xnn_status_uninitialized;
  }

  status = xnn_status_unsupported_hardware;

  // The datatype-specific micro-kernels (U8 or F32) must have initialized.
  if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
    xnn_log_error(
      "failed to create %s operator: operations on data type are not supported",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  const uint32_t pooling_size = pooling_height * pooling_width;
  if (pooling_size == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " pooling size: "
      "pooling size dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type),
      pooling_width, pooling_height);
    goto error;
  }

  // A 1x1 max-pooling window would just copy the input.
  if (pooling_size == 1) {
    xnn_log_error(
      "failed to create %s operator with 1 pooling element: 1x1 pooling is meaningless",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  if (stride_height == 0 || stride_width == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), stride_width, stride_height);
    goto error;
  }

  if (dilation_height == 0 || dilation_width == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu channels: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), channels);
    goto error;
  }

  if (input_pixel_stride < channels) {
    xnn_log_error(
      "failed to create %s operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      xnn_operator_type_to_string(operator_type), input_pixel_stride, channels);
    goto error;
  }

  if (output_pixel_stride < channels) {
    xnn_log_error(
      "failed to create %s operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      xnn_operator_type_to_string(operator_type), output_pixel_stride, channels);
    goto error;
  }

  // TF SAME padding is computed at setup time from the input size, so it is
  // mutually exclusive with explicit padding.
  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    if (any_padding) {
      xnn_log_error(
        "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
        "TensorFlow SAME padding can't be combined with explicit padding specification",
        xnn_operator_type_to_string(operator_type),
        input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
      goto error;
    }
  }

  status = xnn_status_out_of_memory;

  max_pooling_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (max_pooling_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  max_pooling_op->padding_top = input_padding_top;
  max_pooling_op->padding_right = input_padding_right;
  max_pooling_op->padding_bottom = input_padding_bottom;
  max_pooling_op->padding_left = input_padding_left;

  max_pooling_op->kernel_height = pooling_height;
  max_pooling_op->kernel_width = pooling_width;
  max_pooling_op->stride_height = stride_height;
  max_pooling_op->stride_width = stride_width;
  max_pooling_op->dilation_height = dilation_height;
  max_pooling_op->dilation_width = dilation_width;
  max_pooling_op->channels = channels;
  max_pooling_op->input_pixel_stride = input_pixel_stride;
  max_pooling_op->output_pixel_stride = output_pixel_stride;

  // Copy the caller-provided clamping parameters (datatype-specific layout).
  memcpy(&max_pooling_op->params, params, params_size);
  max_pooling_op->type = operator_type;
  max_pooling_op->flags = flags;

  max_pooling_op->state = xnn_run_state_invalid;

  *max_pooling_op_out = max_pooling_op;
  return xnn_status_success;

error:
  // xnn_delete_operator(NULL) is safe, so a single cleanup path suffices.
  xnn_delete_operator(max_pooling_op);
  return status;
}
190 
// Shared setup logic for NHWC max-pooling operators (U8 and F32 variants).
//
// Computes output dimensions (deriving padding on-the-fly when TF SAME
// padding was requested), rebuilds the indirection buffer if the input
// dimensions changed since the last setup, and fills in the operator's
// compute context so the operator is ready to run.
//
// params/params_size point to the datatype-specific clamping parameters that
// are copied into the compute context. num_threads is currently unused here
// — presumably reserved for thread-count-dependent tuning; TODO confirm.
static enum xnn_status setup_max_pooling2d_nhwc(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_output_element_size,
  struct maxpool_parameters maxpool[restrict XNN_MIN_ELEMENTS(1)],
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  // Invalidate first: if any check below fails, the operator must not run.
  max_pooling_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error(
      "failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(max_pooling_op->type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(max_pooling_op->type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  // An empty batch is valid: the run becomes a no-op rather than an error.
  if (batch_size == 0) {
    max_pooling_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  max_pooling_op->input_height = input_height;
  max_pooling_op->input_width = input_width;
  max_pooling_op->input = input;

  if (max_pooling_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    // TF SAME: output extent is ceil(input / stride), and the padding needed
    // to achieve it is computed here and split between the two edges, with
    // the odd element going to the bottom/right edge.
    max_pooling_op->output_height = compute_output_dimension_with_tf_same_padding(
        input_height, max_pooling_op->stride_height);
    max_pooling_op->output_width = compute_output_dimension_with_tf_same_padding(
        input_width, max_pooling_op->stride_width);

    const uint32_t effective_kernel_height = (max_pooling_op->kernel_height - 1) * max_pooling_op->dilation_height + 1;
    const uint32_t effective_kernel_width = (max_pooling_op->kernel_width - 1) * max_pooling_op->dilation_width + 1;
    // doz = difference-or-zero: clamps to 0 when no padding is needed.
    const uint32_t total_padding_height =
      doz((max_pooling_op->output_height - 1) * max_pooling_op->stride_height + effective_kernel_height, input_height);
    const uint32_t total_padding_width =
      doz((max_pooling_op->output_width - 1) * max_pooling_op->stride_width + effective_kernel_width, input_width);
    max_pooling_op->padding_top = total_padding_height / 2;
    max_pooling_op->padding_left = total_padding_width / 2;
    max_pooling_op->padding_bottom = total_padding_height - max_pooling_op->padding_top;
    max_pooling_op->padding_right = total_padding_width - max_pooling_op->padding_left;
  } else {
    // Explicit padding: standard output-dimension formula on the padded input.
    max_pooling_op->output_height = compute_output_dimension(
        max_pooling_op->padding_top + input_height + max_pooling_op->padding_bottom,
        max_pooling_op->kernel_height,
        max_pooling_op->dilation_height,
        max_pooling_op->stride_height);
    max_pooling_op->output_width = compute_output_dimension(
        max_pooling_op->padding_left + input_width + max_pooling_op->padding_right,
        max_pooling_op->kernel_width,
        max_pooling_op->dilation_width,
        max_pooling_op->stride_width);
  }

  const size_t pooling_height = max_pooling_op->kernel_height;
  const size_t pooling_width = max_pooling_op->kernel_width;
  const size_t pooling_size = pooling_height * pooling_width;
  const size_t output_height = max_pooling_op->output_height;
  const size_t output_width = max_pooling_op->output_width;
  // mr: number of pooling elements the micro-kernel consumes per pass.
  const uint32_t mr = maxpool->mr;

  // Indirection-buffer layout parameters: step_width pointers per output
  // column (windows of adjacent columns can share pointers when the stride
  // is smaller than the kernel and there is no dilation), step_height
  // pointers per output row.
  const size_t step_width =
    max_pooling_op->dilation_width > 1 ? pooling_width : min(max_pooling_op->stride_width, pooling_width);
  const size_t step_height = pooling_size + (output_width - 1) * step_width * pooling_height;

  // The indirection buffer only depends on the input dimensions (pointers are
  // adjusted by input_offset below), so reuse it when they are unchanged.
  if (input_height != max_pooling_op->last_input_height ||
      input_width != max_pooling_op->last_input_width)
  {
    // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
    const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + output_height * step_height);
    const void** indirection_buffer =
      (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    max_pooling_op->indirection_buffer = indirection_buffer;

    // Populate the buffer with pointers into the input tensor.
    xnn_indirection_init_maxpool2d(max_pooling_op, step_height, step_width, log2_input_element_size);

    // Remember the input pointer/dimensions the buffer was built against.
    max_pooling_op->last_input = input;
    max_pooling_op->last_input_height = input_height;
    max_pooling_op->last_input_width = input_width;
  }

  // qr: additional pooling elements per iteration of the multipass kernel.
  const uint32_t qr = maxpool->qr;
  const size_t channels = max_pooling_op->channels;

  const size_t indirect_input_height_stride = step_height * sizeof(void*);
  const size_t output_width_stride = max_pooling_op->output_pixel_stride << log2_output_element_size;
  const size_t output_height_stride = output_width * output_height_stride == 0 ? 0 : output_width * output_width_stride;
  const size_t multipass_adjustment = round_up(doz(pooling_size, mr), qr) + mr;

  max_pooling_op->context.max_pooling = (struct max_pooling_context) {
    .indirect_input = max_pooling_op->indirection_buffer,
    // If the input pointer moved but dimensions didn't, the (possibly stale)
    // indirection pointers are corrected by this byte offset at run time.
    .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) max_pooling_op->last_input),
    .indirect_input_height_stride = indirect_input_height_stride,
    .input_batch_stride = (input_height * input_width * max_pooling_op->input_pixel_stride) << log2_input_element_size,
    .output = output,
    .output_batch_stride = output_height * output_height_stride,
    .output_height_stride = output_height_stride,
    .output_width = output_width,
    .pooling_size = pooling_size,
    .channels = channels,
    .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
    .output_increment = output_width_stride - (channels << log2_output_element_size),
    .ukernel = maxpool->ukernel,
  };
  memcpy(&max_pooling_op->context.max_pooling.params, params, params_size);

  // Parallelize 2D over (batch, output row).
  max_pooling_op->compute.type = xnn_parallelization_type_2d;
  max_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_max_pooling;
  max_pooling_op->compute.range[0] = batch_size;
  max_pooling_op->compute.range[1] = output_height;
  max_pooling_op->state = xnn_run_state_ready;

  return xnn_status_success;
}
323 
// Creates a max-pooling operator for NHWC tensors with uint8_t elements.
//
// Validates the [output_min, output_max] clamping range here, then delegates
// all remaining validation and the descriptor allocation to the shared
// create_max_pooling2d_nhwc helper.
enum xnn_status xnn_create_max_pooling2d_nhwc_u8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t pooling_height,
    uint32_t pooling_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    size_t channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* max_pooling_op_out)
{
  // The clamping range must be non-empty and non-degenerate.
  if (output_max <= output_min) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_u8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const union xnn_u8_minmax_params minmax_params = xnn_init_u8_minmax_params(output_min, output_max);
  return create_max_pooling2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    pooling_height, pooling_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    channels, input_pixel_stride, output_pixel_stride,
    flags,
    &minmax_params, sizeof(minmax_params), XNN_INIT_FLAG_U8,
    xnn_operator_type_max_pooling_nhwc_u8,
    max_pooling_op_out);
}
362 
// Creates a max-pooling operator for NHWC tensors with float elements.
//
// Validates the [output_min, output_max] clamping range (rejecting NaN
// bounds and empty ranges), then delegates all remaining validation and
// the descriptor allocation to the shared create_max_pooling2d_nhwc helper.
enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t pooling_height,
    uint32_t pooling_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    size_t channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* max_pooling_op_out)
{
  // Fixed: messages previously said "failed to create %s with ...", missing
  // " operator", inconsistent with the U8 variant and the shared helper.
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const union xnn_f32_minmax_params params = xnn_init_f32_minmax_params(output_min, output_max);
  return create_max_pooling2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    pooling_height, pooling_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    channels, input_pixel_stride, output_pixel_stride,
    flags,
    &params, sizeof(params), XNN_INIT_FLAG_F32,
    xnn_operator_type_max_pooling_nhwc_f32,
    max_pooling_op_out);
}
415 
// Sets up a U8 max-pooling operator for the given batch/input dimensions.
// Thin typed wrapper over the shared setup_max_pooling2d_nhwc helper.
enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
    xnn_operator_t max_pooling_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  // Guard against being handed an operator of a different type.
  const enum xnn_operator_type expected_type = xnn_operator_type_max_pooling_nhwc_u8;
  if (max_pooling_op->type != expected_type) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(expected_type),
      xnn_operator_type_to_string(max_pooling_op->type));
    return xnn_status_invalid_parameter;
  }

  // uint8_t elements: log2(sizeof(uint8_t)) == 0 for both input and output.
  return setup_max_pooling2d_nhwc(
    max_pooling_op,
    batch_size, input_height, input_width,
    input, output,
    /*log2_input_element_size=*/0,
    /*log2_output_element_size=*/0,
    &xnn_params.u8.maxpool,
    &max_pooling_op->params.u8_minmax, sizeof(max_pooling_op->params.u8_minmax),
    pthreadpool_get_threads_count(threadpool));
}
442 
// Sets up an F32 max-pooling operator for the given batch/input dimensions.
// Thin typed wrapper over the shared setup_max_pooling2d_nhwc helper.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
    xnn_operator_t max_pooling_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  // Guard against being handed an operator of a different type.
  const enum xnn_operator_type expected_type = xnn_operator_type_max_pooling_nhwc_f32;
  if (max_pooling_op->type != expected_type) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(expected_type),
      xnn_operator_type_to_string(max_pooling_op->type));
    return xnn_status_invalid_parameter;
  }

  // float elements: log2(sizeof(float)) == 2 for both input and output.
  return setup_max_pooling2d_nhwc(
    max_pooling_op,
    batch_size, input_height, input_width,
    input, output,
    /*log2_input_element_size=*/2,
    /*log2_output_element_size=*/2,
    &xnn_params.f32.maxpool,
    &max_pooling_op->params.f32_minmax, sizeof(max_pooling_op->params.f32_minmax),
    pthreadpool_get_threads_count(threadpool));
}
469 
470