// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/compute.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


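// Computes the output size of a convolution along one spatial dimension:
//   output = (padded_input - effective_kernel) / subsampling + 1
// where effective_kernel = (kernel - 1) * dilation + 1 and the subtraction
// saturates at zero (doz). E.g. padded input 5, kernel 3, dilation 1, and
// subsampling 1 gives (5 - 3) / 1 + 1 = 3 output elements.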
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t subsampling_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
}

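// With TensorFlow SAME padding the output size depends only on the input size
// and the subsampling (stride): output = ceil(input / subsampling).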
static inline size_t compute_output_dimension_with_tf_same_padding(
    size_t input_dimension,
    size_t subsampling_dimension)
{
  return divide_round_up(input_dimension, subsampling_dimension);
}

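// Linearly scans the table of DWCONV micro-kernel descriptors and returns the
// first entry whose primary tile exactly matches the kernel size (e.g. 9 for a
// 3x3 kernel), or NULL if no matching micro-kernel is available.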
static const struct dwconv_parameters* find_dwconv_ukernel(
    size_t kernel_size,
    const struct dwconv_parameters* ukernel,
    size_t num_ukernels)
{
  while (num_ukernels-- != 0) {
    if (ukernel->primary_tile == kernel_size) {
      return ukernel;
    }
    ukernel++;
  }
  return NULL;
}

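// Shared, datatype-agnostic implementation behind the public
// xnn_create_convolution2d_nhwc_{qu8,qs8,f16,f32} entry points. The caller
// supplies element sizes, weight-packing functions, micro-kernel parameter
// tables, and padding bytes appropriate for its datatype.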
static enum xnn_status create_convolution2d_nhwc(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    const void* kernel,
    const void* bias,
    uint32_t flags,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    xnn_pack_vmulcaddc_w_function pack_vmulcaddc_w,
    xnn_pack_dwconv_hwg_w_function pack_dwconv_hwg_w,
    xnn_pack_dwconv_ghw_w_function pack_dwconv_ghw_w,
    xnn_pack_gemm_goi_w_function pack_gemm_goi_w,
    xnn_pack_conv_kgo_w_function pack_conv_kgo_w,
    xnn_pack_conv_goki_w_function pack_conv_goki_w,
    const void* packing_params,
    int input_padding_byte,
    int packed_weights_padding_byte,
    const void* params,
    size_t params_size,
    const struct gemm_parameters* gemm_parameters,
    const struct dwconv_parameters* dwconv_parameters,
    size_t num_dwconv_parameters,
    const struct vmulcaddc_parameters* vmulcaddc_parameters,
    bool linear_activation,
    bool relu_activation,
    uint32_t datatype_init_flags,
    enum xnn_operator_type operator_type,
    xnn_operator_t* convolution_op_out)
{
  xnn_operator_t convolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error(
      "failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_unsupported_hardware;

  if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
    xnn_log_error(
      "failed to create %s operator: operations on data type are not supported",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " subsampling: subsampling dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
      xnn_operator_type_to_string(operator_type), groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_channel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input channel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      input_channel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_channel_stride < output_channels) {
    xnn_log_error(
      "failed to create %s operator with output channel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      output_channel_stride, groups, group_output_channels);
    goto error;
  }

  if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
    xnn_log_error(
      "failed to create depthwise %s operator with %zu input channels per group: "
      "depthwise convolution must have exactly 1 input channel per group",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    if (any_padding) {
      xnn_log_error(
        "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
        "TensorFlow SAME padding can't be combined with explicit padding specification",
        xnn_operator_type_to_string(operator_type),
        input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
      goto error;
    }
  }

  status = xnn_status_out_of_memory;

  convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (convolution_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  const size_t kernel_size = kernel_height * kernel_width;

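  // Select the micro-kernel type: a 1x1, stride-1, unpadded convolution with
  // one input and one output channel per group is a per-channel multiply-add
  // (VMULCADDC); one input and one output channel per group with a matching
  // DWCONV micro-kernel is a depthwise convolution; any other 1x1, stride-1,
  // unpadded convolution maps directly to GEMM; everything else uses indirect
  // GEMM (IGEMM) through an indirection buffer.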
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_default;
  const struct dwconv_parameters* dwconv_ukernel = NULL;
  const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
  if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding && vmulcaddc_parameters != NULL) {
    ukernel_type = xnn_ukernel_type_vmulcaddc;
  } else if (group_input_channels == 1 && group_output_channels == 1 && (dwconv_ukernel =
               find_dwconv_ukernel(kernel_size, dwconv_parameters, num_dwconv_parameters)) != NULL)
  {
    ukernel_type = xnn_ukernel_type_dwconv;
  } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
    ukernel_type = xnn_ukernel_type_gemm;
  } else {
    ukernel_type = xnn_ukernel_type_igemm;
  }
  assert(ukernel_type != xnn_ukernel_type_default);

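  // Pack the weights for the selected micro-kernel type and compute zero_size,
  // the size of the buffer that stands in for padded input pixels (allocated
  // and filled with the input padding byte below, only when padding is used).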
  size_t zero_size = 0;
  switch (ukernel_type) {
    case xnn_ukernel_type_vmulcaddc:
    {
      assert(vmulcaddc_parameters != NULL);

      const size_t c_stride = round_up_po2(groups, vmulcaddc_parameters->channel_tile);
      const size_t packed_weights_size = ((UINT32_C(1) << log2_filter_element_size) + bias_element_size) * c_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error(
          "failed to allocate %zu bytes for %s operator packed weights",
          packed_weights_size, xnn_operator_type_to_string(operator_type));
        goto error;
      }

      pack_vmulcaddc_w(
        groups, vmulcaddc_parameters->channel_tile,
        kernel, bias, convolution_op->packed_weights, packing_params);

      convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
        .function = vmulcaddc_parameters->ukernel,
        .mr = vmulcaddc_parameters->row_tile,
      };
      break;
    }
    case xnn_ukernel_type_dwconv:
    {
      assert(dwconv_ukernel != NULL);
      assert(dwconv_ukernel->primary_tile == kernel_size);

      const size_t c_stride = round_up_po2(groups, dwconv_ukernel->channel_tile);
      const size_t packed_weights_size = ((kernel_size << log2_filter_element_size) + bias_element_size) * c_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error(
          "failed to allocate %zu bytes for %s operator packed weights",
          packed_weights_size, xnn_operator_type_to_string(operator_type));
        goto error;
      }
      memset(convolution_op->packed_weights, packed_weights_padding_byte, packed_weights_size);

      if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
        pack_dwconv_hwg_w(
          kernel_height, kernel_width,
          groups, dwconv_ukernel->channel_tile,
          kernel, bias, convolution_op->packed_weights, packing_params);
      } else {
        pack_dwconv_ghw_w(
          kernel_height, kernel_width,
          groups, dwconv_ukernel->channel_tile,
          kernel, bias, convolution_op->packed_weights, packing_params);
      }

      const union dwconv_fused_ukernels* ukernels = &dwconv_ukernel->minmax;
      if (linear_activation && dwconv_ukernel->linear.unipass != NULL) {
        ukernels = &dwconv_ukernel->linear;
      }
      convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
        .unipass_function = ukernels->unipass,
        .primary_tile = dwconv_ukernel->primary_tile,
        .incremental_tile = dwconv_ukernel->incremental_tile,
      };

      zero_size = XNN_EXTRA_BYTES + (c_stride << log2_input_element_size);
      break;
    }
    case xnn_ukernel_type_gemm:
    case xnn_ukernel_type_igemm:
    {
      const uint32_t nr = gemm_parameters->nr;
      const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
      const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;
      const size_t n_stride = round_up(group_output_channels, nr);
      const size_t k_stride = round_up_po2(group_input_channels, kr);

      const size_t packed_group_weights_size = ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size) * n_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error(
          "failed to allocate %zu bytes for %s operator packed weights",
          packed_group_weights_size * groups, xnn_operator_type_to_string(operator_type));
        goto error;
      }
      memset(convolution_op->packed_weights, packed_weights_padding_byte, packed_group_weights_size * groups);

      const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
      if (linear_activation && gemm_parameters->linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
        gemm_ukernels = &gemm_parameters->linear;
      } else if (relu_activation && gemm_parameters->relu.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
        gemm_ukernels = &gemm_parameters->relu;
      }
      switch (ukernel_type) {
        case xnn_ukernel_type_gemm:
          pack_gemm_goi_w(
              groups, group_output_channels, group_input_channels,
              nr, kr, sr,
              kernel, bias, convolution_op->packed_weights, packing_params);
          convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
            .mr = gemm_parameters->mr,
            .nr = nr,
            .kr = kr,
            .general_case = gemm_ukernels->gemm,
            .mr1_case = gemm_ukernels->gemm1,
          };
          break;
        case xnn_ukernel_type_igemm:
          if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
            pack_conv_kgo_w(
              groups, group_output_channels, kernel_size,
              nr, kr,
              kernel, bias, convolution_op->packed_weights, packing_params);
          } else {
            pack_conv_goki_w(
              groups, group_output_channels, kernel_size, group_input_channels,
              nr, kr, sr,
              kernel, bias, convolution_op->packed_weights, packing_params);
          }
          convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
            .mr = gemm_parameters->mr,
            .nr = nr,
            .kr = kr,
            .general_case = gemm_ukernels->igemm,
            .mr1_case = gemm_ukernels->igemm1,
          };
          break;
        default:
          XNN_UNREACHABLE;
      }

      zero_size = XNN_EXTRA_BYTES + (k_stride << log2_input_element_size);
      break;
    }
    default:
      XNN_UNREACHABLE;
  }

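  // The zero buffer supplies the values read for padded input positions (the
  // indirection buffer points padded taps at it); it is filled with the
  // datatype's input padding byte, e.g. the input zero point for quantized
  // operators.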
  const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
  if (any_padding || tf_same_padding) {
    convolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
    if (convolution_op->zero_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator zero padding",
        zero_size, xnn_operator_type_to_string(operator_type));
      goto error;
    }
    memset(convolution_op->zero_buffer, input_padding_byte, zero_size);
  }

  convolution_op->padding_top = input_padding_top;
  convolution_op->padding_right = input_padding_right;
  convolution_op->padding_bottom = input_padding_bottom;
  convolution_op->padding_left = input_padding_left;

  convolution_op->kernel_height = kernel_height;
  convolution_op->kernel_width = kernel_width;
  convolution_op->stride_height = subsampling_height;
  convolution_op->stride_width = subsampling_width;
  convolution_op->dilation_height = dilation_height;
  convolution_op->dilation_width = dilation_width;
  convolution_op->groups = groups;
  convolution_op->group_input_channels = group_input_channels;
  convolution_op->group_output_channels = group_output_channels;
  convolution_op->input_pixel_stride = input_channel_stride;
  convolution_op->output_pixel_stride = output_channel_stride;

  memcpy(&convolution_op->params, params, params_size);
  convolution_op->type = operator_type;
  convolution_op->ukernel.type = ukernel_type;
  if (tf_same_padding) {
    convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
  }

  convolution_op->state = xnn_run_state_invalid;

  *convolution_op_out = convolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(convolution_op);
  return status;
}

enum xnn_status xnn_create_convolution2d_nhwc_qu8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

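  // The int32 accumulator is requantized by multiplying with
  // input_scale * kernel_scale / output_scale; the fixed-point requantization
  // used here requires this multiplier to lie in (0, 1).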
  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 1.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater or equal to 1.0",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  const struct xnn_qu8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
    .kernel_zero_point = kernel_zero_point,
  };
  const union xnn_qu8_gemm_params params = xnn_init_qu8_gemm_params(
    kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) NULL,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_qu8_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_qu8_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_qu8_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_qu8_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qu8.gemm, xnn_params.qu8.dwconv, XNN_MAX_QU8_DWCONV_UKERNELS, NULL /* vmulcaddc parameters */,
    false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QU8,
    xnn_operator_type_convolution_nhwc_qu8,
    convolution_op_out);
}

enum xnn_status xnn_create_convolution2d_nhwc_qs8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    int8_t input_zero_point,
    float input_scale,
    float kernel_scale,
    const int8_t* kernel,
    const int32_t* bias,
    int8_t output_zero_point,
    float output_scale,
    int8_t output_min,
    int8_t output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 1.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater or equal to 1.0",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };
  const union xnn_qs8_gemm_params params = xnn_init_qs8_gemm_params(
    requantization_scale, output_zero_point, output_min, output_max);
  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) NULL,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_qs8_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_qs8_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_qs8_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_qs8_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qs8.gemm, xnn_params.qs8.dwconv, XNN_MAX_QS8_DWCONV_UKERNELS, NULL /* vmulcaddc parameters */,
    false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QS8,
    xnn_operator_type_convolution_nhwc_qs8,
    convolution_op_out);
}

enum xnn_status xnn_create_convolution2d_nhwc_f16(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    const void* kernel,
    const void* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
    return xnn_status_invalid_parameter;
  }

  const uint16_t fp16_output_min = fp16_ieee_from_fp32_value(output_min);
  const uint16_t fp16_output_max = fp16_ieee_from_fp32_value(output_max);
  const float rounded_output_min = fp16_ieee_to_fp32_value(fp16_output_min);
  const float rounded_output_max = fp16_ieee_to_fp32_value(fp16_output_max);
  if (rounded_output_min >= rounded_output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16), rounded_output_min, rounded_output_max);
    return xnn_status_invalid_parameter;
  }

  const struct {
    struct xnn_f16_minmax_params minmax;
    struct xnn_f16_scaleminmax_params scaleminmax;
  } params = {
    .minmax = xnn_init_f16_minmax_params(fp16_output_min, fp16_output_max),
    .scaleminmax = xnn_init_f16_scaleminmax_params(
        UINT16_C(0x3C00) /* 1.0 */, fp16_output_min, fp16_output_max),
  };
  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
    1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
    sizeof(uint16_t) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) xnn_pack_f16_vmulcaddc_w,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_f16_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_f16_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_f16_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_f16_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_f16_conv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.f16.gemm, xnn_params.f16.dwconv, XNN_MAX_F16_DWCONV_UKERNELS, &xnn_params.f16.vmulcaddc,
    false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_F16,
    xnn_operator_type_convolution_nhwc_f16,
    convolution_op_out);
}

enum xnn_status xnn_create_convolution2d_nhwc_f32(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

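  // Detect clamping ranges that are no-ops so specialized micro-kernels can be
  // selected: [-inf, +inf] is linear activation and [0, +inf] is ReLU.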
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
  const bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f);
  const union xnn_f32_minmax_params params = xnn_init_f32_minmax_params(output_min, output_max);
  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_vmulcaddc_w,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_f32_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_f32_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.f32.gemm, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS, &xnn_params.f32.vmulcaddc,
    linear_activation, relu_activation, XNN_INIT_FLAG_F32,
    xnn_operator_type_convolution_nhwc_f32,
    convolution_op_out);
}

static enum xnn_status setup_convolution2d_nhwc(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  uint32_t datatype_init_flags,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* gemm_params,
  const void* dwconv_params,
  size_t num_threads)
{
  convolution_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(convolution_op->type));
    return xnn_status_uninitialized;
  }

  if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
    xnn_log_error(
      "failed to setup %s operator: operations on data type are not supported",
      xnn_operator_type_to_string(convolution_op->type));
    return xnn_status_unsupported_hardware;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(convolution_op->type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    convolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  convolution_op->batch_size = batch_size;
  convolution_op->input_height = input_height;
  convolution_op->input_width = input_width;
  convolution_op->input = input;

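  // For TensorFlow SAME padding the amount of padding depends on the input
  // size, so it is computed here at setup time and split as evenly as
  // possible, with any odd pixel going to the bottom/right edge.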
  if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    convolution_op->output_height = compute_output_dimension_with_tf_same_padding(
        input_height, convolution_op->stride_height);
    convolution_op->output_width = compute_output_dimension_with_tf_same_padding(
        input_width, convolution_op->stride_width);

    const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1;
    const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1;
    const size_t total_padding_height =
      (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height;
    const size_t total_padding_width =
      (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width;
    convolution_op->padding_top = total_padding_height / 2;
    convolution_op->padding_left = total_padding_width / 2;
    convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top;
    convolution_op->padding_right = total_padding_width - convolution_op->padding_left;
  } else {
    convolution_op->output_height = compute_output_dimension(
        convolution_op->padding_top + input_height + convolution_op->padding_bottom,
        convolution_op->kernel_height,
        convolution_op->dilation_height,
        convolution_op->stride_height);
    convolution_op->output_width = compute_output_dimension(
        convolution_op->padding_left + input_width + convolution_op->padding_right,
        convolution_op->kernel_width,
        convolution_op->dilation_width,
        convolution_op->stride_width);
  }
  convolution_op->output = output;

  switch (convolution_op->ukernel.type) {
    case xnn_ukernel_type_gemm:
    {
      // Convolution maps directly to GEMM and doesn't use indirection buffer.

      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t output_size = output_height * output_width;
      const size_t batch_output_size = batch_size * output_size;

      const size_t groups = convolution_op->groups;
      const size_t group_input_channels = convolution_op->group_input_channels;
      const size_t w_stride = bias_element_size +
        (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr) << log2_filter_element_size);
      const size_t group_output_channels = convolution_op->group_output_channels;

      uint32_t mr = convolution_op->ukernel.gemm.mr;
      const uint32_t nr = convolution_op->ukernel.gemm.nr;
      struct xnn_hmp_gemm_ukernel gemm_ukernel = convolution_op->ukernel.gemm.general_case;
      if (batch_output_size == 1 && convolution_op->ukernel.gemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
        mr = 1;
        gemm_ukernel = convolution_op->ukernel.gemm.mr1_case;
      }

      convolution_op->context.gemm = (struct gemm_context) {
          .k_scaled = group_input_channels << log2_input_element_size,
          .a = input,
          .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
          .packed_w = convolution_op->packed_weights,
          .w_stride = w_stride,
          .wg_stride = w_stride * round_up(group_output_channels, nr),
          .c = output,
          .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
          .cn_stride = nr << log2_output_element_size,
          .cg_stride = group_output_channels << log2_output_element_size,
          .log2_csize = log2_output_element_size,
          .ukernel = gemm_ukernel,
      };
      memcpy(&convolution_op->context.gemm.params, gemm_params, sizeof(convolution_op->context.gemm.params));

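      // Shrink the output-channel tile so that, together with the batch/pixel
      // tiles, each thread gets roughly target_tiles_per_thread tiles of work.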
      size_t nc = group_output_channels;
      if (num_threads > 1) {
        const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
        const size_t target_tiles_per_thread = 5;
        const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
        if (max_nc < nc) {
          nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
        }
      }
      if (groups == 1) {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
            convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
            convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
            convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
          }
        #else
          convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
          convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
        #endif
        convolution_op->compute.range[0] = batch_output_size;
        convolution_op->compute.range[1] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      } else {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
            convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_gemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
            convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
          }
        #else
          convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
          convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
        #endif
        convolution_op->compute.range[0] = groups;
        convolution_op->compute.range[1] = batch_output_size;
        convolution_op->compute.range[2] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      }
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    case xnn_ukernel_type_igemm:
    {
      const size_t groups = convolution_op->groups;
      const size_t kernel_height = convolution_op->kernel_height;
      const size_t kernel_width = convolution_op->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t output_size = output_height * output_width;

      uint32_t mr = convolution_op->ukernel.igemm.mr;
      const uint32_t nr = convolution_op->ukernel.igemm.nr;
      struct xnn_hmp_igemm_ukernel igemm_ukernel = convolution_op->ukernel.igemm.general_case;
      if (output_size == 1 && convolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
        mr = 1;
        igemm_ukernel = convolution_op->ukernel.igemm.mr1_case;
      }

      const size_t tiled_output_size = round_up(output_size, mr);
      const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

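      // The indirection buffer holds, for every output pixel and kernel tap, a
      // pointer to the corresponding input pixel (or to the zero buffer for
      // padded positions). It depends only on the input spatial dimensions, so
      // it is rebuilt only when they change; a new input pointer with the same
      // dimensions is handled via a_offset below.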
      if (input_height != convolution_op->last_input_height ||
          input_width != convolution_op->last_input_width)
      {
        const void** indirection_buffer = (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size);
        if (indirection_buffer == NULL) {
          xnn_log_error(
            "failed to allocate %zu bytes for %s operator indirection buffer",
            indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
          return xnn_status_out_of_memory;
        }
        convolution_op->indirection_buffer = indirection_buffer;
        convolution_op->last_input = input;
        convolution_op->last_input_height = input_height;
        convolution_op->last_input_width = input_width;

        xnn_indirection_init_conv2d(convolution_op, mr, log2_input_element_size);
      }

      const size_t group_input_channels = convolution_op->group_input_channels;
      const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size) + bias_element_size;
      const size_t group_output_channels = convolution_op->group_output_channels;
      convolution_op->context.igemm = (struct igemm_context) {
          .ks = kernel_size,
          .ks_scaled = kernel_size * mr * sizeof(void*),
          .kc = group_input_channels << log2_input_element_size,
          .w_stride = w_stride,
          .indirect_a = convolution_op->indirection_buffer,
          .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
          .zero = convolution_op->zero_buffer,
          .packed_w = convolution_op->packed_weights,
          .c = convolution_op->output,
          .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
          .cn_stride = nr << log2_output_element_size,
          .ga_stride = group_input_channels << log2_input_element_size,
          .gw_stride = w_stride * round_up(group_output_channels, nr),
          .gc_stride = group_output_channels << log2_output_element_size,
          .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
          .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
          .log2_csize = log2_output_element_size,
          .ukernel = igemm_ukernel,
      };
      memcpy(&convolution_op->context.igemm.params, gemm_params, sizeof(convolution_op->context.igemm.params));

      size_t nc = group_output_channels;
      if (num_threads > 1) {
        const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
        const size_t target_tiles_per_thread = 5;
        const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
        if (max_nc < nc) {
          nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
        }
      }
      if (groups == 1) {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
            if (batch_size > 1) {
              convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
              convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_batch_hmp_igemm;
            } else {
              convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
              convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_igemm;
            }
          } else {
            if (batch_size > 1) {
              convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
              convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
            } else {
              convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
              convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
            }
          }
        #else
          if (batch_size > 1) {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
            convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
            convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
          }
        #endif
        if (batch_size > 1) {
          convolution_op->compute.range[0] = batch_size;
          convolution_op->compute.range[1] = output_size;
          convolution_op->compute.range[2] = group_output_channels;
        } else {
          convolution_op->compute.range[0] = output_size;
          convolution_op->compute.range[1] = group_output_channels;
        }
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      } else {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
            if (batch_size > 1) {
              convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d_with_uarch;
              convolution_op->compute.task_4d_tile_2d_with_id = (pthreadpool_task_4d_tile_2d_with_id_t) xnn_compute_hmp_grouped_batch_igemm;
            } else {
              convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
              convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_igemm;
            }
          } else {
            if (batch_size > 1) {
              convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
              convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
            } else {
              convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
              convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
            }
          }
        #else
          if (batch_size > 1) {
            convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
            convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
            convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
          }
        #endif
        if (batch_size > 1) {
          convolution_op->compute.range[0] = batch_size;
          convolution_op->compute.range[1] = groups;
          convolution_op->compute.range[2] = output_size;
          convolution_op->compute.range[3] = group_output_channels;
        } else {
          convolution_op->compute.range[0] = groups;
          convolution_op->compute.range[1] = output_size;
          convolution_op->compute.range[2] = group_output_channels;
        }
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      }
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    case xnn_ukernel_type_dwconv:
    {
      const size_t kernel_height = convolution_op->kernel_height;
      const size_t kernel_width = convolution_op->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
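      // step_width is the number of new input-pointer columns each output
      // pixel contributes along a row (the stride when dilation is 1, since
      // neighboring windows overlap and share entries, otherwise the full
      // kernel width); step_height is the number of indirection entries per
      // output row.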
1093       const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
1094       const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
      if (input_height != convolution_op->last_input_height || input_width != convolution_op->last_input_width) {
        const size_t indirection_buffer_size = sizeof(void*) * output_height * step_height;

        const void** indirection_buffer =
          (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size);
        if (indirection_buffer == NULL) {
          xnn_log_error("failed to allocate %zu bytes for %s operator indirection buffer",
            indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
          return xnn_status_out_of_memory;
        }
        convolution_op->indirection_buffer = indirection_buffer;

        xnn_indirection_init_dwconv2d(convolution_op, step_height, step_width, log2_input_element_size);

        convolution_op->last_input = input;
        convolution_op->last_input_height = input_height;
        convolution_op->last_input_width = input_width;
      }

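      // Fill the per-run DWCONV context. Strides are expressed in bytes; input_offset is
      // the distance from the input pointer used when the indirection buffer was last
      // initialized to the current input, so the buffer can be reused when only the
      // input pointer (not the shape) changed between setups.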
      const size_t groups = convolution_op->groups;
      convolution_op->context.dwconv = (struct dwconv_context) {
          .indirect_input = convolution_op->indirection_buffer,
          .indirect_input_width_stride = kernel_height * step_width * sizeof(void*),
          .indirect_input_height_stride = step_height * sizeof(void*),
          .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
          .input_batch_stride = (input_height * input_width * convolution_op->input_pixel_stride) << log2_input_element_size,
          .packed_weights = convolution_op->packed_weights,
          .output = convolution_op->output,
          .output_batch_stride = (output_height * output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
          .output_height_stride = (output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
          .output_width = output_width,
          .groups = groups,
          .zero = convolution_op->zero_buffer,
          .output_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
          .unipass_ukernel = convolution_op->ukernel.dwconv.unipass_function,
      };
      memcpy(&convolution_op->context.dwconv.params, dwconv_params, sizeof(convolution_op->context.dwconv.params));

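      // Parallelize the depthwise convolution over (batch element, output row) pairs.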
      convolution_op->compute.type = xnn_parallelization_type_2d;
      convolution_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv_unipass;
      convolution_op->compute.range[0] = batch_size;
      convolution_op->compute.range[1] = output_height;
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
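    // VMULCADDC path: the operation is applied as a per-channel multiply-add over all
    // batch_size * output_height * output_width output pixels of `groups` channels each.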
    case xnn_ukernel_type_vmulcaddc:
    {
      const size_t batch_output_size = batch_size * convolution_op->output_height * convolution_op->output_width;

      convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
          .n = convolution_op->groups << log2_input_element_size,
          .x = input,
          .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
          .w = convolution_op->packed_weights,
          .y = output,
          .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
          .ukernel = convolution_op->ukernel.vmulcaddc.function,
      };
      memcpy(&convolution_op->context.vmulcaddc.params, dwconv_params, sizeof(convolution_op->context.vmulcaddc.params));

      size_t mc = batch_output_size;
      if (num_threads > 1) {
        const size_t target_tiles_per_thread = 5;
        const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
        if (max_mc < mc) {
          const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
          mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
        }
      }
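      // Tile the pixel range so that, with multiple threads, each thread receives roughly
      // target_tiles_per_thread tiles, with the tile size rounded to a multiple of the
      // micro-kernel's row tile mr.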
      convolution_op->compute.type = xnn_parallelization_type_1d_tile_1d;
      convolution_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
      convolution_op->compute.range[0] = batch_output_size;
      convolution_op->compute.tile[0] = mc;
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    default:
      XNN_UNREACHABLE;
  }
}

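// The public setup entry points below differ only in element sizes and in which packed
// parameter struct they pass to setup_convolution2d_nhwc.
//
// A minimal usage sketch (not part of this file; it assumes the library was initialized
// with xnn_initialize() and that the operator was created with the matching
// xnn_create_convolution2d_nhwc_f32 call, whose argument list is elided here):
//
//   xnn_operator_t op = NULL;
//   xnn_create_convolution2d_nhwc_f32(/* ... */, &op);   // create operator, pack weights once
//   xnn_setup_convolution2d_nhwc_f32(op, batch_size, input_height, input_width,
//                                    input, output, threadpool);  // per-shape/pointer setup
//   xnn_run_operator(op, threadpool);                     // execute the convolution
//   xnn_delete_operator(op);                              // release operator resources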
enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_nhwc_qu8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8),
      xnn_operator_type_to_string(convolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    XNN_INIT_FLAG_QU8,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &convolution_op->params.qu8_gemm,
    &convolution_op->params.qu8_gemm,
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const int8_t* input,
    int8_t* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_nhwc_qs8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8),
      xnn_operator_type_to_string(convolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    XNN_INIT_FLAG_QS8,
    0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(int8_t)) */,
    &convolution_op->params.qs8_gemm,
    &convolution_op->params.qs8_gemm,
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_convolution2d_nhwc_f16(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_nhwc_f16) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16),
      xnn_operator_type_to_string(convolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    XNN_INIT_FLAG_F16,
    1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
    1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
    sizeof(uint16_t) /* sizeof(bias element) */,
    1 /* log2(sizeof(output element)) = log2(sizeof(uint16_t)) */,
    &convolution_op->params.f16_scaleminmax,
    &convolution_op->params.f16_minmax,
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_convolution2d_nhwc_f32(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_nhwc_f32) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32),
      xnn_operator_type_to_string(convolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    XNN_INIT_FLAG_F32,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &convolution_op->params.f32_minmax,
    &convolution_op->params.f32_minmax,
    pthreadpool_get_threads_count(threadpool));
}