1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #pragma once
10 
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 
15 #include <pthreadpool.h>
16 
17 #ifdef __cplusplus
18 extern "C" {
19 #endif
20 
21 /// The number of bytes XNNPACK may read beyond array bounds.
22 /// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
23 ///
24 /// Note: XNNPACK reads, but never writes beyond array bounds.
25 #define XNN_EXTRA_BYTES 16
26 
27 /// Maximum number of dimensions in tensor shape.
28 #define XNN_MAX_TENSOR_DIMS 6
29 
30 /// Allow sparse inference in a Runtime.
31 ///
32 /// Note: this flag forces XNNPACK to consider sparse inference, but does not guarantee it.
33 #define XNN_FLAG_SPARSE_INFERENCE 0x00000001
34 
35 /// The convolution operator represents a depthwise convolution, and uses HWGo layout for filters.
36 #define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001
37 
38 /// Assume transposed weights in a fully connected operator.
39 #define XNN_FLAG_TRANSPOSE_WEIGHTS 0x00000001
40 
41 /// The operator assumes NHWC layout for the input, regardless of the output layout.
42 #define XNN_FLAG_INPUT_NHWC 0x00000002
43 
44 /// Match "SAME" padding in TensorFlow. Exact padding values are computed dynamically depending on input size.
45 #define XNN_FLAG_TENSORFLOW_SAME_PADDING 0x00000004
46 
47 /// Implicitly flatten and reshape input of a Fully Connected operator into a 2D
48 /// tensor.
49 #define XNN_FLAG_TENSORFLOW_RESHAPE_2D 0x00000004
50 
51 /// Match behaviour of TensorFlow 1.x.
52 #define XNN_FLAG_TENSORFLOW_LEGACY_MODE 0x00000004
53 
54 /// Align corners of input and output images in resize operations.
55 #define XNN_FLAG_ALIGN_CORNERS 0x00000008
56 
/// Status code for any XNNPACK function call.
enum xnn_status {
  /// The call succeeded, and all output arguments now contain valid data.
  xnn_status_success = 0,
  /// XNNPACK was not initialized before use (see xnn_initialize).
  xnn_status_uninitialized = 1,
  /// One of the arguments passed to the call was invalid.
  xnn_status_invalid_parameter = 2,
  /// The object operated on is in a state that does not permit this call.
  xnn_status_invalid_state = 3,
  /// The arguments are valid, but the requested parameter combination is not supported.
  xnn_status_unsupported_parameter = 4,
  /// The host processor does not support the features required for this call.
  xnn_status_unsupported_hardware = 5,
  /// The call failed because memory allocation did not succeed.
  xnn_status_out_of_memory = 6,
};
68 
/// User-provided memory allocation callbacks, passed to xnn_initialize to override
/// the system-provided memory management functions.
struct xnn_allocator {
  /// User-specified pointer that will be passed as-is to all functions in this structure.
  void* context;
  /// Pointer to a function to be called for general memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*allocate)(void* context, size_t size);
  /// Pointer to a function to be called for general memory re-allocation, i.e. to increase or shrink a previously
  /// allocated memory block. The content of the old memory block is copied to the new memory block.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref reallocate call is equivalent to an @ref allocate call.
  /// @param size - The new size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the newly allocated memory block of at least @ref size bytes with the content of the previous
  ///          memory block.
  ///          If allocation fails, the function must return NULL, but must not release the previous memory block.
  void* (*reallocate)(void* context, void* pointer, size_t size);
  /// Pointer to a function to be called for general memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref deallocate call is a no-op.
  void (*deallocate)(void* context, void* pointer);
  /// Pointer to a function to be called for aligned memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param alignment - The alignment of the memory block to allocate, in bytes. Alignment is always a power-of-2.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*aligned_allocate)(void* context, size_t alignment, size_t size);
  /// Pointer to a function to be called for aligned memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref aligned_allocate function. Can be NULL.
  ///                  If the pointer is NULL, the @ref aligned_deallocate call is a no-op.
  void (*aligned_deallocate)(void* context, void* pointer);
};
114 
/// Initialize XNNPACK library.
///
/// XNNPACK must be successfully initialized before use.
/// During initialization, XNNPACK populates internal structures depending on the host processor. It can be
/// time-consuming.
///
/// @param[in] allocator - structure with function pointers to be used for memory allocation and de-allocation.
///                        If this argument is NULL, system-provided memory management functions (e.g. malloc/free)
///                        will be used.
///
/// @retval xnn_status_success - XNNPACK is successfully initialized and ready to use.
/// @retval xnn_status_out_of_memory - initialization failed due to out-of-memory condition.
/// @retval xnn_status_unsupported_hardware - initialization failed because the host processor does not satisfy the
///                                           minimum hardware requirements for XNNPACK. E.g. this may happen on x86
///                                           processors without SSE2 extension, or on 32-bit ARM processors without
///                                           the NEON SIMD extension.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator);
131 
132 /// Deinitialize XNNPACK library.
133 ///
134 /// To avoid memory and resource leaks, users must call xnn_deinitialize once for each successful xnn_initialize call.
135 ///
136 /// @retval xnn_status_success - deinitialization call succeeded.
137 enum xnn_status xnn_deinitialize(void);
138 
139 /// Subgraph is an abstract representation of a neural network model.
140 /// Subgraph objects are used to define Values (tensors) and Nodes (operators) comprising the model.
141 typedef struct xnn_subgraph* xnn_subgraph_t;
142 
/// Create an empty Subgraph object.
///
/// @param external_value_ids - number of Value IDs to reserve for communication with external graph representation.
///                             The Subgraph object will avoid creating internal Value IDs in the
///                             [0, external_value_ids-1] range.
/// @param flags - binary features of the subgraph. No supported flags are currently defined.
/// @param subgraph_out - pointer to the variable that will be initialized with a handle to the Subgraph object upon
///                       successful return.
enum xnn_status xnn_create_subgraph(
  uint32_t external_value_ids,
  uint32_t flags,
  xnn_subgraph_t* subgraph_out);
155 
156 /// Destroy a Subgraph object, as well as Values, and Nodes associated with the subgraph.
157 ///
158 /// @param subgraph - the Subgraph object to destroy.
159 enum xnn_status xnn_delete_subgraph(
160   xnn_subgraph_t subgraph);
161 
162 #define XNN_VALUE_FLAG_EXTERNAL_INPUT  0x00000001
163 #define XNN_VALUE_FLAG_EXTERNAL_OUTPUT 0x00000002
164 
165 #define XNN_INVALID_VALUE_ID UINT32_MAX
166 
167 /// Type of elements in a Value object.
168 enum xnn_datatype {
169   /// Invalid data type. Valid Values never have this datatype.
170   xnn_datatype_invalid = 0,
171   /// IEEE754 single-precision floating-point.
172   xnn_datatype_fp32 = 1,
173   /// IEEE754 half-precision floating-point.
174   xnn_datatype_fp16 = 2,
175 };
176 
/// Define a tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param num_dims - number of dimensions in the shape.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  size_t num_dims,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
203 
204 /// Define a 2D Convolution Node and add it to a Subgraph.
205 ///
206 /// @param subgraph - a Subgraph object that will own the created Node.
207 /// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
208 ///                            flag is specified.
209 /// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
210 ///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
211 /// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
212 ///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
213 /// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
214 ///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
215 /// @param kernel_height - kernel (filter) height.
216 /// @param kernel_width - kernel (filter) width.
217 /// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
218 /// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
219 /// @param dilation_height - dilation of kernel elements along the height dimension.
220 /// @param dilation_width - dilation of kernel elements along the width dimension.
221 /// @param groups - number of convolution groups.
222 /// @param group_input_channels - number of input channels per group.
223 /// @param group_output_channels - number of output channels per group.
224 /// @param output_min - lower bound for clipping output values.
225 /// @param output_max - upper bound for clipping output values.
226 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
227 ///                   with [N, IH, IW, groups * group_input_channels] dimensions
228 /// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
229 ///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
230 ///                    dimensions.
231 /// @param bias_id - Value ID for the bias tensor. The bias tensor must be a 1D tensor defined in the @a subgraph with
232 ///                  [groups * group_output_channels] dimensions.
233 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
234 ///                    with [N, OH, OW, groups * group_output_channels] dimensions.
235 /// @param flags - binary features of the 2D Convolution Node. The only currently supported value is
236 ///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
237 enum xnn_status xnn_define_convolution_2d(
238   xnn_subgraph_t subgraph,
239   uint32_t input_padding_top,
240   uint32_t input_padding_right,
241   uint32_t input_padding_bottom,
242   uint32_t input_padding_left,
243   uint32_t kernel_height,
244   uint32_t kernel_width,
245   uint32_t subsampling_height,
246   uint32_t subsampling_width,
247   uint32_t dilation_height,
248   uint32_t dilation_width,
249   uint32_t groups,
250   size_t group_input_channels,
251   size_t group_output_channels,
252   float output_min,
253   float output_max,
254   uint32_t input_id,
255   uint32_t filter_id,
256   uint32_t bias_id,
257   uint32_t output_id,
258   uint32_t flags);
259 
260 /// Define a 2D Deconvolution (Transposed Convolution) Node and add it to a Subgraph.
261 ///
262 /// @param subgraph - a Subgraph object that will own the created Node.
263 /// @param padding_top - implicit padding above 2D output data.
264 /// @param padding_right - implicit padding to the right of 2D output data.
265 /// @param padding_bottom - implicit padding below 2D output data.
266 /// @param padding_left - implicit padding to the left of 2D output data.
267 /// @param adjustment_height - additional elements in the bottom of the 2D output data.
268 /// @param adjustment_width - additional elements to the right of the 2D output data.
269 /// @param kernel_height - kernel (filter) height.
270 /// @param kernel_width - kernel (filter) width.
271 /// @param upsampling_height - height of upsampling region for deconvolution input (deconvolution height stride).
272 /// @param upsampling_width - width of upsampling region for deconvolution input (deconvolution width stride).
273 /// @param dilation_height - dilation of kernel elements along the height dimension.
274 /// @param dilation_width - dilation of kernel elements along the width dimension.
275 /// @param groups - number of convolution groups.
276 /// @param group_input_channels - number of input channels per group.
277 /// @param group_output_channels - number of output channels per group.
278 /// @param output_min - lower bound for clipping output values.
279 /// @param output_max - upper bound for clipping output values.
280 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
281 ///                   with [N, IH, IW, groups * group_input_channels] dimensions
282 /// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
283 ///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
284 ///                    dimensions.
285 /// @param bias_id - Value ID for the bias tensor. The bias tensor must be a 1D tensor defined in the @a subgraph with
286 ///                  [groups * group_output_channels] dimensions.
287 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
288 ///                    with [N, OH, OW, groups * group_output_channels] dimensions.
289 /// @param flags - binary features of the 2D Deconvolution Node. No supported flags are currently defined.
290 enum xnn_status xnn_define_deconvolution_2d(
291   xnn_subgraph_t subgraph,
292   uint32_t padding_top,
293   uint32_t padding_right,
294   uint32_t padding_bottom,
295   uint32_t padding_left,
296   uint32_t adjustment_height,
297   uint32_t adjustment_width,
298   uint32_t kernel_height,
299   uint32_t kernel_width,
300   uint32_t upsampling_height,
301   uint32_t upsampling_width,
302   uint32_t dilation_height,
303   uint32_t dilation_width,
304   uint32_t groups,
305   size_t group_input_channels,
306   size_t group_output_channels,
307   float output_min,
308   float output_max,
309   uint32_t input_id,
310   uint32_t filter_id,
311   uint32_t bias_id,
312   uint32_t output_id,
313   uint32_t flags);
314 
315 /// Define a 2D Depthwise Convolution Node and add it to a Subgraph.
316 ///
317 /// @param subgraph - a Subgraph object that will own the created Node.
318 /// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
319 ///                            flag is specified.
320 /// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
321 ///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
322 /// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
323 ///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
324 /// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
325 ///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
326 /// @param kernel_height - kernel (filter) height.
327 /// @param kernel_width - kernel (filter) width.
328 /// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
329 /// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
330 /// @param dilation_height - dilation of kernel elements along the height dimension.
331 /// @param dilation_width - dilation of kernel elements along the width dimension.
332 /// @param depth_multiplier - ratio of output channels to input channels.
333 /// @param input_channels - number of input channels.
334 /// @param output_min - lower bound for clipping output values.
335 /// @param output_max - upper bound for clipping output values.
336 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
337 ///                   with [N, IH, IW, input_channels] dimensions
338 /// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
339 ///                    with [1, kernel_height, kernel_width, input_channels * depth_multiplier] dimensions.
340 /// @param bias_id - Value ID for the bias tensor. The bias tensor must be a 1D tensor defined in the @a subgraph with
341 ///                  [input_channels * depth_multiplier] dimensions.
342 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
343 ///                    with [N, OH, OW, input_channels * depth_multiplier] dimensions.
344 /// @param flags - binary features of the 2D Depthwise Convolution Node. The only currently supported value is
345 ///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
346 enum xnn_status xnn_define_depthwise_convolution_2d(
347   xnn_subgraph_t subgraph,
348   uint32_t input_padding_top,
349   uint32_t input_padding_right,
350   uint32_t input_padding_bottom,
351   uint32_t input_padding_left,
352   uint32_t kernel_height,
353   uint32_t kernel_width,
354   uint32_t subsampling_height,
355   uint32_t subsampling_width,
356   uint32_t dilation_height,
357   uint32_t dilation_width,
358   uint32_t depth_multiplier,
359   size_t input_channels,
360   float output_min,
361   float output_max,
362   uint32_t input_id,
363   uint32_t filter_id,
364   uint32_t bias_id,
365   uint32_t output_id,
366   uint32_t flags);
367 
368 /// Define a DepthToSpace Node and add it to a Subgraph.
369 ///
370 /// The DepthToSpace Node rearranges data from depth into blocks of spatial data (a reverse transform for SpaceToDepth).
371 /// For a given input pixel, an output square of pixels with side @a block_size is formed from values in the corresponding
372 /// number of its channels. The output depth is therefore @a block_size x @a block_size times smaller than that of the input.
373 ///
374 /// @param subgraph - a Subgraph object that will own the created Node.
375 /// @param input_id - Value ID for the input tensor. The input tensor must be divisible by @a block_size x @a block_size in the channel dimension.
376 /// @param output_id - Value ID for the output tensor.
377 /// @param block_size - the size of the spatial block.
378 /// @param flags - binary features of the DepthToSpace Node. No supported flags are currently defined.
379 enum xnn_status xnn_define_depth_to_space(
380   xnn_subgraph_t subgraph,
381   uint32_t input_id,
382   uint32_t output_id,
383   uint32_t block_size,
384   uint32_t flags);
385 
386 /// Define a 2D Global Average Pooling Node and add it to a Subgraph.
387 ///
388 /// @param subgraph - a Subgraph object that will own the created Node.
389 /// @param output_min - lower bound for clipping output values.
390 /// @param output_max - upper bound for clipping output values.
391 /// @param input_id - Value ID for the input tensor. The input tensor must be a
392 ///                   4D tensor defined in the @a subgraph with [N, H, W, C]
393 ///                   dimensions
394 /// @param output_id - Value ID for the output tensor. The output tensor must be
395 ///                    a 4D tensor defined in the @a subgraph with [N, 1, 1, C]
396 ///                    dimensions.
397 /// @param flags - binary features of the 2D Global Average Pooling Node. No
398 ///                supported flags are currently defined.
399 enum xnn_status xnn_define_global_average_pooling_2d(
400   xnn_subgraph_t subgraph,
401   float output_min,
402   float output_max,
403   uint32_t input_id,
404   uint32_t output_id,
405   uint32_t flags);
406 
407 /// Define a 2D Average Pooling Node and add it to a Subgraph.
408 ///
409 /// @param subgraph - a Subgraph object that will own the created Node.
410 /// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
411 ///                            flag is specified.
412 /// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
413 ///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
414 /// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
415 ///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
416 /// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
417 ///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
418 /// @param pooling_height - pooling (kernel) height.
419 /// @param pooling_width - pooling (kernel) width.
420 /// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
421 ///                        to vertically adjacent output pixels.
422 /// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
423 ///                        to horizontally adjacent output pixels.
424 /// @param output_min - lower bound for clipping output values.
425 /// @param output_max - upper bound for clipping output values.
426 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
427 ///                   with [N, IH, IW, channels] dimensions
428 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
429 ///                    with [N, OH, OW, channels] dimensions.
430 /// @param flags - binary features of the 2D Average Pooling Node. The only currently supported value is
431 ///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
432 enum xnn_status xnn_define_average_pooling_2d(
433   xnn_subgraph_t subgraph,
434   uint32_t input_padding_top,
435   uint32_t input_padding_right,
436   uint32_t input_padding_bottom,
437   uint32_t input_padding_left,
438   uint32_t pooling_height,
439   uint32_t pooling_width,
440   uint32_t stride_height,
441   uint32_t stride_width,
442   float output_min,
443   float output_max,
444   uint32_t input_id,
445   uint32_t output_id,
446   uint32_t flags);
447 
448 /// Define a Fully Connected Node and add it to a Subgraph.
449 ///
450 /// @param subgraph - a Subgraph object that will own the created Node.
451 /// @param output_min - lower bound for clipping output values.
452 /// @param output_max - upper bound for clipping output values.
453 /// @param input_id - Value ID for the input tensor. The input tensor must be an
454 /// N-dimensional tensor defined in the @a
455 ///                   subgraph.
456 ///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the
457 ///                   input tensor must be at least 1D and its last dimension
458 ///                   must match the last dimension of the filter tensor. In
459 ///                   particular, if input is a 2D tensor, it must have
460 ///                   [batch_size, input_channels] dimensions. If
461 ///                   XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of
462 ///                   elements in the input tensor must be divisible by the
463 ///                   input_channels. The tensor will be first flattened into a
464 ///                   1D tensor of [num_input_elements] dimensions, then
465 ///                   reshaped into a 2D tensor of [num_input_elements /
466 ///                   input_channels, input_channels] dimensions where
467 ///                   num_input_elements is the total number of elements in the
468 ///                   input tensor.
469 /// @param filter_id - Value ID for the filter tensor. The filter tensor must be
470 /// a 2D tensor defined in the @a subgraph
471 ///                    with [output_channels, input_channels] dimensions.
472 /// @param bias_id - Value ID for the bias tensor. The bias tensor must be a 1D
473 /// tensor defined in the @a subgraph with
474 ///                  [output_channels] dimensions.
475 /// @param output_id - Value ID for the output tensor. The output tensor must be
476 /// defined in the @a subgraph.
477 ///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the
478 ///                    output tensor must have the same dimensionality as the
479 ///                    input tensor, all its dimensions but the last one must
480 ///                    match the corresponding dimensions of the input tensor,
481 ///                    and the last dimensions of the output tensor must match
482 ///                    the first dimension of the filter tensor. In particular,
483 ///                    if input is a 2D tensor, output must be a 2D tensor of
484 ///                    [batch_size, output_channels] dimensions. If
485 ///                    XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must
486 ///                    be a 2D tensor of [num_input_elements / input_channels,
487 ///                    output_channels] dimensions where num_input_elements is
488 ///                    the total number of elements in the input tensor.
489 /// @param flags - binary features of the Fully Connected Node. The only
490 ///                currently supported value is XNN_FLAG_TENSORFLOW_RESHAPE_2D.
491 enum xnn_status xnn_define_fully_connected(xnn_subgraph_t subgraph,
492                                            float output_min, float output_max,
493                                            uint32_t input_id,
494                                            uint32_t filter_id, uint32_t bias_id,
495                                            uint32_t output_id, uint32_t flags);
496 
497 /// Define a 2D Max Pooling Node and add it to a Subgraph.
498 ///
499 /// @param subgraph - a Subgraph object that will own the created Node.
500 /// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
501 ///                            flag is specified.
502 /// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
503 ///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
504 /// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
505 ///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
506 /// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
507 ///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
508 /// @param pooling_height - pooling (kernel) height.
509 /// @param pooling_width - pooling (kernel) width.
510 /// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
511 ///                        to vertically adjacent output pixels.
512 /// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
513 ///                        to horizontally adjacent output pixels.
514 /// @param dilation_height - dilation of pooling elements along the height dimension.
515 /// @param dilation_width - dilation of pooling elements along the width dimension.
516 /// @param output_min - lower bound for clipping output values.
517 /// @param output_max - upper bound for clipping output values.
518 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
519 ///                   with [N, IH, IW, channels] dimensions
520 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
521 ///                    with [N, OH, OW, channels] dimensions.
522 /// @param flags - binary features of the 2D Max Pooling Node. The only currently supported value is
523 ///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
524 enum xnn_status xnn_define_max_pooling_2d(
525   xnn_subgraph_t subgraph,
526   uint32_t input_padding_top,
527   uint32_t input_padding_right,
528   uint32_t input_padding_bottom,
529   uint32_t input_padding_left,
530   uint32_t pooling_height,
531   uint32_t pooling_width,
532   uint32_t stride_height,
533   uint32_t stride_width,
534   uint32_t dilation_height,
535   uint32_t dilation_width,
536   float output_min,
537   float output_max,
538   uint32_t input_id,
539   uint32_t output_id,
540   uint32_t flags);
541 
/// Define a 2D ArgMax Pooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data.
/// @param input_padding_bottom - implicit zero-padding below 2D input data.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data.
/// @param pooling_height - pooling (kernel) height. Vertical stride between pooling regions match this value.
/// @param pooling_width - pooling (kernel) width. Horizontal stride between pooling regions match this value.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, channels] dimensions.
/// @param output_value_id - Value ID for the output tensor with the maximum values in the pools. The output tensor must
///                          be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels] dimensions.
/// @param output_index_id - Value ID for the output tensor with the indexes of the maximum values in the pools. The
///                          output tensor must be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels]
///                          dimensions.
/// @param flags - binary features of the 2D ArgMax Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_argmax_pooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t input_id,
  uint32_t output_value_id,
  uint32_t output_index_id,
  uint32_t flags);
571 
/// Define a 2D UnPooling Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param padding_top - implicit padding above 2D output data.
/// @param padding_right - implicit padding to the right of 2D output data.
/// @param padding_bottom - implicit padding below 2D output data.
/// @param padding_left - implicit padding to the left of 2D output data.
/// @param pooling_height - height of the pooling window.
/// @param pooling_width - width of the pooling window.
/// @param input_value_id - Value ID for the input tensor with the max-pooling values to invert. The input value tensor
///                         must be a 4D tensor defined in the @a subgraph with [N, IH, IW, channels] dimensions.
/// @param input_index_id - Value ID for the input tensor with the indices of the per-pool maximum values produced by
///                         a 2D ArgMax Pooling Node. The input tensor must be a 4D tensor defined in the @a subgraph
///                         with [N, IH, IW, channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, channels] dimensions.
/// @param flags - binary features of the 2D UnPooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_unpooling_2d(
  xnn_subgraph_t subgraph,
  uint32_t padding_top,
  uint32_t padding_right,
  uint32_t padding_bottom,
  uint32_t padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t input_value_id,
  uint32_t input_index_id,
  uint32_t output_id,
  uint32_t flags);
601 
/// Define a 2-Input Add Node and add it to a Subgraph.
///
/// The 2-Input Add Node computes elementwise addition of two tensor inputs with NumPy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
///                    of the two inputs.
/// @param flags - binary features of the Add Node. No supported flags are currently defined.
enum xnn_status xnn_define_add2(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
629 
/// Define a 2-Input Multiply Node and add it to a Subgraph.
///
/// The 2-Input Multiply Node computes elementwise multiplication of two tensor inputs with NumPy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
///                    of the two inputs.
/// @param flags - binary features of the Multiply Node. No supported flags are currently defined.
enum xnn_status xnn_define_multiply2(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
657 
/// Define a Subtract Node and add it to a Subgraph.
///
/// The Subtract Node computes elementwise subtraction of two tensor inputs with NumPy broadcasting rules.
/// NOTE(review): presumably computes input1 - input2 — confirm operand order against the implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
///                    of the two inputs.
/// @param flags - binary features of the Subtract Node. No supported flags are currently defined.
enum xnn_status xnn_define_subtract(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
685 
/// Define a Divide Node and add it to a Subgraph.
///
/// The Divide Node computes elementwise division of two tensor inputs with NumPy broadcasting rules.
/// NOTE(review): presumably computes input1 / input2 — confirm operand order against the implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
///                    of the two inputs.
/// @param flags - binary features of the Divide Node. No supported flags are currently defined.
enum xnn_status xnn_define_divide(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
713 
/// Define a 2-Input Maximum Node and add it to a Subgraph.
///
/// The 2-Input Maximum Node computes elementwise maximum of two tensor inputs with NumPy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
///                    of the two inputs.
/// @param flags - binary features of the Maximum Node. No supported flags are currently defined.
enum xnn_status xnn_define_maximum2(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
737 
/// Define a 2-Input Minimum Node and add it to a Subgraph.
///
/// The 2-Input Minimum Node computes elementwise minimum of two tensor inputs with NumPy broadcasting rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
///                    of the two inputs.
/// @param flags - binary features of the Minimum Node. No supported flags are currently defined.
enum xnn_status xnn_define_minimum2(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
761 
/// Define a Squared Difference Node and add it to a Subgraph.
///
/// The Squared Difference Node computes elementwise squared difference of two tensor inputs with NumPy broadcasting
/// rules.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
///                    that dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to the maximum of the corresponding dimensions
///                    of the two inputs.
/// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined.
enum xnn_status xnn_define_squared_difference(
  xnn_subgraph_t subgraph,
  uint32_t input1_id,
  uint32_t input2_id,
  uint32_t output_id,
  uint32_t flags);
786 
/// Define a Constant Pad Node with static padding specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param pre_paddings - number of padding elements to insert before input elements for every dimension. This array
///                       must have as many elements as the number of dimensions in the input tensor.
/// @param post_paddings - number of padding elements to insert after input elements for every dimension. This array
///                       must have as many elements as the number of dimensions in the input tensor.
/// @param padding_value - constant value used to initialize padding elements.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor with padding.
/// @param flags - binary features of the Constant Pad Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_constant_pad(
  xnn_subgraph_t subgraph,
  const size_t* pre_paddings,
  const size_t* post_paddings,
  float padding_value,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
807 
/// Define a Reshape Node with static shape specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param num_dims - number of shape dimensions in the output tensor.
/// @param new_shape - shape dimensions of the output tensor.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape specified with @a new_shape.
/// @param flags - binary features of the Reshape Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_reshape(
  xnn_subgraph_t subgraph,
  size_t num_dims,
  const size_t* new_shape,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
824 
/// Define a 2D Resize Bilinear Node with static output height & width specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param new_height - height dimension of the output tensor.
/// @param new_width - width dimension of the output tensor.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, H, W, C] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, new_height, new_width, C] dimensions.
/// @param flags - binary features of the 2D Resize Bilinear Node. The only currently supported values are
///                XNN_FLAG_TENSORFLOW_LEGACY_MODE and XNN_FLAG_ALIGN_CORNERS, which are mutually exclusive.
enum xnn_status xnn_define_static_resize_bilinear_2d(
  xnn_subgraph_t subgraph,
  size_t new_height,
  size_t new_width,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
843 
/// Define a PReLU (Parametric ReLU) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, H, W, channels] dimensions.
/// @param slope_id - Value ID for the slope tensor. The slope tensor must be a 1D tensor defined in the @a subgraph
///                   with [channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, H, W, channels] dimensions.
/// @param flags - binary features of the PReLU Node. No supported flags are currently defined.
enum xnn_status xnn_define_prelu(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t slope_id,
  uint32_t output_id,
  uint32_t flags);
860 
/// Define an Abs (elementwise absolute value) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Abs Node. No supported flags are currently defined.
enum xnn_status xnn_define_abs(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
873 
/// Define a Bankers' Rounding (elementwise round-to-nearest, ties to even) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined.
enum xnn_status xnn_define_bankers_rounding(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
886 
/// Define a Ceiling (elementwise rounding up to integral values) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Ceiling Node. No supported flags are currently defined.
enum xnn_status xnn_define_ceiling(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
899 
/// Define a Clamp Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Clamp Node. No supported flags are currently defined.
enum xnn_status xnn_define_clamp(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
916 
/// Define an ELU (Exponential Linear Unit) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param alpha - scale factor for negative output elements.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the ELU Node. No supported flags are currently defined.
enum xnn_status xnn_define_elu(
  xnn_subgraph_t subgraph,
  float alpha,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
931 
/// Define a Floor (elementwise rounding down to integral values) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Floor Node. No supported flags are currently defined.
enum xnn_status xnn_define_floor(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
944 
/// Define a HardSwish Node and add it to a Subgraph.
/// NOTE(review): presumably computes x * max(0, min(6, x + 3)) / 6 as in MobileNetV3 — confirm in implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the HardSwish Node. No supported flags are currently defined.
enum xnn_status xnn_define_hardswish(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
957 
/// Define a Leaky ReLU Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param negative_slope - scale factor for negative input elements.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined.
enum xnn_status xnn_define_leaky_relu(
  xnn_subgraph_t subgraph,
  float negative_slope,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
972 
/// Define a Negate (elementwise sign flip) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Negate Node. No supported flags are currently defined.
enum xnn_status xnn_define_negate(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
985 
/// Define a Sigmoid Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined.
enum xnn_status xnn_define_sigmoid(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
998 
/// Define a SoftMax Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph, and have at
///                   least one dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the SoftMax Node. No supported flags are currently defined.
enum xnn_status xnn_define_softmax(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1012 
/// Define a Square (elementwise x * x) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Square Node. No supported flags are currently defined.
enum xnn_status xnn_define_square(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1025 
/// Define a Square Root Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Square Root Node. No supported flags are currently defined.
enum xnn_status xnn_define_square_root(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1038 
/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
typedef struct xnn_runtime* xnn_runtime_t;
1041 
/// Create a Runtime object from a subgraph.
///
/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime. No Values or
///                   Nodes can be added to the runtime once it is constructed.
/// @param threadpool - the thread pool to be used for parallelization of computations in the runtime. If the thread
///                     pool is NULL, the computation would run on the caller thread without parallelization.
/// @param flags - binary features of the runtime. The only currently supported value is XNN_FLAG_SPARSE_INFERENCE.
/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
///                      successful return. Once constructed, the Runtime object is independent of the Subgraph object
///                      used to create it.
enum xnn_status xnn_create_runtime_v2(
  xnn_subgraph_t subgraph,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1057 
/// Create a Runtime object from a subgraph.
/// NOTE(review): presumably equivalent to @ref xnn_create_runtime_v2 with a NULL thread pool and no flags —
/// confirm against the implementation.
///
/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime.
/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
///                      successful return.
enum xnn_status xnn_create_runtime(
  xnn_subgraph_t subgraph,
  xnn_runtime_t* runtime_out);
1061 
/// Location information for an external input or output Value of a Runtime.
struct xnn_external_value {
  // Value ID, as assigned when the Value was defined in the Subgraph.
  uint32_t id;
  // Pointer to the tensor data for this Value.
  void* data;
};
1066 
/// Setup data pointers for external inputs and outputs in a Runtime object.
/// NOTE(review): the data pointers presumably must remain valid for subsequent @ref xnn_invoke_runtime calls —
/// confirm against the implementation.
///
/// @param runtime - a Runtime object created with @ref xnn_create_runtime or @ref xnn_create_runtime_v2.
/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
///                              match the number of external inputs and outputs in the runtime, i.e. all external
///                              inputs and outputs in the runtime must be specified in one call.
/// @param external_values - array with location information for all external inputs and outputs in the runtime.
enum xnn_status xnn_setup_runtime(
  xnn_runtime_t runtime,
  size_t num_external_values,
  const struct xnn_external_value* external_values);
1078 
1079 /// Execute forward pass for all operators in the runtime.
1080 ///
1081 /// @param runtime - the Runtime object with the execution plan to invoke.
1082 enum xnn_status xnn_invoke_runtime(
1083   xnn_runtime_t runtime);
1084 
1085 /// Destroy a Runtime object, as well as operators and memory associated with it.
1086 ///
1087 /// @param runtime - the Runtime object to destroy.
1088 enum xnn_status xnn_delete_runtime(
1089   xnn_runtime_t runtime);
1090 
/// Opaque handle to an Operator object, created by one of the xnn_create_* functions below.
typedef struct xnn_operator* xnn_operator_t;

/// Execute an operator.
///
/// @param op - the Operator object to run (typically configured first via the matching xnn_setup_* call).
/// @param threadpool - the thread pool to parallelize the computation on; if NULL, the computation presumably runs
///                     on the caller thread (mirroring the Runtime API above — confirm).
enum xnn_status xnn_run_operator(
  xnn_operator_t op,
  pthreadpool_t threadpool);

/// Destroy an Operator object and release resources associated with it.
///
/// @param op - the Operator object to destroy.
enum xnn_status xnn_delete_operator(
  xnn_operator_t op);
1099 
#ifndef XNN_NO_F32_OPERATORS

/// Create an Abs (element-wise absolute value) operator with F32 data in NC layout.
enum xnn_status xnn_create_abs_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* abs_op_out);

/// Set up an Abs operator: bind the batch size and input/output pointers before running it.
enum xnn_status xnn_setup_abs_nc_f32(
  xnn_operator_t abs_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create an Add operator for element-wise addition of two N-dimensional F32 tensors,
/// clamping the result to [output_min, output_max].
enum xnn_status xnn_create_add_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an Add operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_add_nd_f32(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D ArgMax Pooling operator with F32 data in NHWC layout.
/// Produces both pooled values and an index output (see the setup function below).
enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* argmax_pooling_op_out);

/// Set up a 2D ArgMax Pooling operator: bind sizes, input/output pointers, and the
/// index output buffer (presumably the position of each maximum — confirm semantics).
enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32(
  xnn_operator_t argmax_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  uint32_t* index,
  pthreadpool_t threadpool);

/// Create a 2D Average Pooling operator with F32 data in NHWC layout,
/// clamping the output to [output_min, output_max].
enum xnn_status xnn_create_average_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Set up a 2D Average Pooling operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_average_pooling2d_nhwc_f32(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Bankers' Rounding (round-half-to-even) operator with F32 data in NC layout.
enum xnn_status xnn_create_bankers_rounding_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* rounding_op_out);

/// Set up a Bankers' Rounding operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_bankers_rounding_nc_f32(
  xnn_operator_t rounding_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Ceiling (element-wise round up) operator with F32 data in NC layout.
enum xnn_status xnn_create_ceiling_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* ceiling_op_out);

/// Set up a Ceiling operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_ceiling_nc_f32(
  xnn_operator_t ceiling_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Clamp operator: element-wise clamp to [output_min, output_max], F32 data in NC layout.
enum xnn_status xnn_create_clamp_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Set up a Clamp operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_clamp_nc_f32(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with F32 data in NHWC layout.
/// Pass XNN_FLAG_DEPTHWISE_CONVOLUTION in @p flags for a depthwise convolution with HWGo-layout filters.
enum xnn_status xnn_create_convolution2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_f32(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator with F32 data in NHWC layout.
enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* deconvolution_op_out);

/// Set up a 2D Deconvolution operator: bind sizes, output-size adjustments, and input/output pointers.
enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Divide operator for element-wise division of two N-dimensional F32 tensors,
/// clamping the result to [output_min, output_max].
enum xnn_status xnn_create_divide_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* divide_op_out);

/// Set up a Divide operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_divide_nd_f32(
  xnn_operator_t divide_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an ELU (Exponential Linear Unit) operator with scale factor @p alpha, F32 data in NC layout.
enum xnn_status xnn_create_elu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up an ELU operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_elu_nc_f32(
  xnn_operator_t elu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator with F32 data in NC layout.
/// Pass XNN_FLAG_TRANSPOSE_WEIGHTS in @p flags when @p kernel holds transposed weights.
enum xnn_status xnn_create_fully_connected_nc_f32(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* fully_connected_op_out);

/// Set up a Fully Connected operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_fully_connected_nc_f32(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Floor (element-wise round down) operator with F32 data in NC layout.
enum xnn_status xnn_create_floor_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* floor_op_out);

/// Set up a Floor operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_floor_nc_f32(
  xnn_operator_t floor_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with F32 data in NWC layout,
/// clamping the output to [output_min, output_max].
enum xnn_status xnn_create_global_average_pooling_nwc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling (NWC) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a HardSwish operator with F32 data in NC layout.
enum xnn_status xnn_create_hardswish_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_hardswish_nc_f32(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator with a single scalar @p negative_slope, F32 data in NC layout.
enum xnn_status xnn_create_leaky_relu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a Leaky ReLU operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_leaky_relu_nc_f32(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator with F32 data in NHWC layout,
/// clamping the output to [output_min, output_max].
enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a 2D Max Pooling operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Maximum operator for element-wise maximum of two N-dimensional F32 tensors.
enum xnn_status xnn_create_maximum_nd_f32(
  uint32_t flags,
  xnn_operator_t* maximum_op_out);

/// Set up a Maximum operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_maximum_nd_f32(
  xnn_operator_t maximum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Minimum operator for element-wise minimum of two N-dimensional F32 tensors.
enum xnn_status xnn_create_minimum_nd_f32(
  uint32_t flags,
  xnn_operator_t* minimum_op_out);

/// Set up a Minimum operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_minimum_nd_f32(
  xnn_operator_t minimum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Multiply operator for element-wise multiplication of two N-dimensional F32 tensors,
/// clamping the result to [output_min, output_max].
enum xnn_status xnn_create_multiply_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up a Multiply operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_multiply_nd_f32(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Negate (element-wise sign flip) operator with F32 data in NC layout.
enum xnn_status xnn_create_negate_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* negate_op_out);

/// Set up a Negate operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_negate_nc_f32(
  xnn_operator_t negate_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a PReLU operator with F32 data in NC layout.
/// Unlike Leaky ReLU, @p negative_slope is an array (presumably one slope per channel — confirm length).
enum xnn_status xnn_create_prelu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  const float* negative_slope,
  uint32_t flags,
  xnn_operator_t* prelu_op_out);

/// Set up a PReLU operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_prelu_nc_f32(
  xnn_operator_t prelu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with F32 data in NCHW layout.
/// XNN_FLAG_ALIGN_CORNERS in @p flags aligns corners of input and output images.
enum xnn_status xnn_create_resize_bilinear2d_nchw_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize (NCHW) operator: bind input/output sizes and pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nchw_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator with F32 data in NHWC layout.
/// XNN_FLAG_ALIGN_CORNERS in @p flags aligns corners of input and output images.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a 2D Bilinear Resize (NHWC) operator: bind input/output sizes and pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator with F32 data in NC layout.
enum xnn_status xnn_create_sigmoid_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a Sigmoid operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_sigmoid_nc_f32(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a SoftMax operator with F32 data in NC layout.
enum xnn_status xnn_create_softmax_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a SoftMax operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_softmax_nc_f32(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Square (element-wise x*x) operator with F32 data in NC layout.
enum xnn_status xnn_create_square_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* square_op_out);

/// Set up a Square operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_square_nc_f32(
  xnn_operator_t square_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Square Root operator with F32 data in NC layout.
enum xnn_status xnn_create_square_root_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sqrt_op_out);

/// Set up a Square Root operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_square_root_nc_f32(
  xnn_operator_t sqrt_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Squared Difference operator for two N-dimensional F32 tensors.
enum xnn_status xnn_create_squared_difference_nd_f32(
  uint32_t flags,
  xnn_operator_t* squared_difference_op_out);

/// Set up a Squared Difference operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_squared_difference_nd_f32(
  xnn_operator_t squared_difference_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Subtract operator for element-wise subtraction of two N-dimensional F32 tensors,
/// clamping the result to [output_min, output_max].
enum xnn_status xnn_create_subtract_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up a Subtract operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_subtract_nd_f32(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Truncation (element-wise round toward zero) operator with F32 data in NC layout.
enum xnn_status xnn_create_truncation_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* truncation_op_out);

/// Set up a Truncation operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_truncation_nc_f32(
  xnn_operator_t truncation_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

#ifndef XNN_NO_NCHW_OPERATORS

/// Create a 2D Convolution operator with F32 data in NCHW layout.
/// Pass XNN_FLAG_INPUT_NHWC in @p flags if the input (but not the output) uses NHWC layout.
enum xnn_status xnn_create_convolution2d_nchw_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution (NCHW) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_convolution2d_nchw_f32(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with F32 data in NCW layout,
/// clamping the output to [output_min, output_max].
enum xnn_status xnn_create_global_average_pooling_ncw_f32(
  size_t channels,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling (NCW) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_global_average_pooling_ncw_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

#endif  // XNN_NO_NCHW_OPERATORS

#endif  // XNN_NO_F32_OPERATORS
1697 
#ifndef XNN_NO_X32_OPERATORS

// X32 operators are data-type-agnostic and operate on any 32-bit elements, hence the void* data pointers.

/// Create a Channel Shuffle operator for 32-bit data in NC layout.
enum xnn_status xnn_create_channel_shuffle_nc_x32(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);

/// Set up a Channel Shuffle operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_channel_shuffle_nc_x32(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Constant Pad operator for 32-bit data.
/// @p padding_value points to the 32-bit value used to fill the padded region.
enum xnn_status xnn_create_constant_pad_nd_x32(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad operator: bind the input shape, per-dimension pre/post padding, and data pointers.
enum xnn_status xnn_setup_constant_pad_nd_x32(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Copy operator for 32-bit data in NC layout.
enum xnn_status xnn_create_copy_nc_x32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_copy_nc_x32(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator for 32-bit data, NHWC input and NHWC output.
enum xnn_status xnn_create_depth_to_space_nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space (NHWC) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_depth_to_space_nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator for 32-bit data, NCHW input converted to NHWC output.
enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space (NCHW-to-NHWC) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Unpooling operator for 32-bit data in NHWC layout.
enum xnn_status xnn_create_unpooling2d_nhwc_x32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* unpooling_op_out);

/// Set up a 2D Unpooling operator: bind sizes, input pointer, the index buffer
/// (presumably produced by ArgMax Pooling — confirm), and the output pointer.
enum xnn_status xnn_setup_unpooling2d_nhwc_x32(
  xnn_operator_t unpooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  const uint32_t* index,
  void* output,
  pthreadpool_t threadpool);

#endif  // XNN_NO_X32_OPERATORS
1802 
#ifndef XNN_NO_F16_OPERATORS

// F16 operators take half-precision data through void* pointers (no standard C type for FP16);
// note that output_min/output_max are still specified as float.

/// Create an Add operator for element-wise addition of two N-dimensional F16 tensors,
/// clamping the result to [output_min, output_max].
enum xnn_status xnn_create_add_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an Add (F16) operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_add_nd_f16(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with F16 data in NHWC layout.
enum xnn_status xnn_create_convolution2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution (F16) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_f16(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with F16 data in NWC layout,
/// clamping the output to [output_min, output_max].
enum xnn_status xnn_create_global_average_pooling_nwc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling (F16, NWC) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f16(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a HardSwish operator with F16 data in NC layout.
enum xnn_status xnn_create_hardswish_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish (F16) operator: bind the batch size and input/output pointers.
enum xnn_status xnn_setup_hardswish_nc_f16(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Multiply operator for element-wise multiplication of two N-dimensional F16 tensors,
/// clamping the result to [output_min, output_max].
enum xnn_status xnn_create_multiply_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up a Multiply (F16) operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_multiply_nd_f16(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

#endif  // XNN_NO_F16_OPERATORS
1903 
#ifndef XNN_NO_QS8_OPERATORS

// QS8 operators process quantized signed 8-bit data; each tensor carries a zero point and a scale.

/// Create an Add operator for element-wise addition of two N-dimensional QS8 tensors,
/// with per-tensor quantization parameters and output clamping to [output_min, output_max].
enum xnn_status xnn_create_add_nd_qs8(
  int8_t input1_zero_point,
  float input1_scale,
  int8_t input2_zero_point,
  float input2_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an Add (QS8) operator: bind the shapes of both inputs and the data pointers.
enum xnn_status xnn_setup_add_nd_qs8(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const int8_t* input1,
  const int8_t* input2,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator with QS8 data in NHWC layout.
/// Bias is int32; kernel quantization uses a single scale (no kernel zero point parameter).
enum xnn_status xnn_create_convolution2d_nhwc_qs8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution (QS8) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator with QS8 data in NWC layout.
enum xnn_status xnn_create_global_average_pooling_nwc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling (QS8) operator: bind sizes and input/output pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_qs8(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

#endif  // XNN_NO_QS8_OPERATORS
1988 
1989 #ifndef XNN_NO_QU8_OPERATORS
1990 
1991 enum xnn_status xnn_create_average_pooling2d_nhwc_qu8(
1992   uint32_t input_padding_top,
1993   uint32_t input_padding_right,
1994   uint32_t input_padding_bottom,
1995   uint32_t input_padding_left,
1996   uint32_t pooling_height,
1997   uint32_t pooling_width,
1998   uint32_t stride_height,
1999   uint32_t stride_width,
2000   size_t channels,
2001   size_t input_pixel_stride,
2002   size_t output_pixel_stride,
2003   uint8_t input_zero_point,
2004   float input_scale,
2005   uint8_t output_zero_point,
2006   float output_scale,
2007   uint8_t output_min,
2008   uint8_t output_max,
2009   uint32_t flags,
2010   xnn_operator_t* average_pooling_op_out);
2011 
2012 enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8(
2013   xnn_operator_t average_pooling_op,
2014   size_t batch_size,
2015   size_t input_height,
2016   size_t input_width,
2017   const uint8_t* input,
2018   uint8_t* output,
2019   pthreadpool_t threadpool);
2020 
2021 enum xnn_status xnn_create_convolution2d_nhwc_qu8(
2022   uint32_t input_padding_top,
2023   uint32_t input_padding_right,
2024   uint32_t input_padding_bottom,
2025   uint32_t input_padding_left,
2026   uint32_t kernel_height,
2027   uint32_t kernel_width,
2028   uint32_t subsampling_height,
2029   uint32_t subsampling_width,
2030   uint32_t dilation_height,
2031   uint32_t dilation_width,
2032   uint32_t groups,
2033   size_t group_input_channels,
2034   size_t group_output_channels,
2035   size_t input_channel_stride,
2036   size_t output_channel_stride,
2037   uint8_t input_zero_point,
2038   float input_scale,
2039   uint8_t kernel_zero_point,
2040   float kernel_scale,
2041   const uint8_t* kernel,
2042   const int32_t* bias,
2043   uint8_t output_zero_point,
2044   float output_scale,
2045   uint8_t output_min,
2046   uint8_t output_max,
2047   uint32_t flags,
2048   xnn_operator_t* convolution_op_out);
2049 
2050 enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
2051   xnn_operator_t convolution_op,
2052   size_t batch_size,
2053   size_t input_height,
2054   size_t input_width,
2055   const uint8_t* input,
2056   uint8_t* output,
2057   pthreadpool_t threadpool);
2058 
2059 enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
2060   uint32_t output_padding_top,
2061   uint32_t output_padding_right,
2062   uint32_t output_padding_bottom,
2063   uint32_t output_padding_left,
2064   uint32_t kernel_height,
2065   uint32_t kernel_width,
2066   uint32_t stride_height,
2067   uint32_t stride_width,
2068   uint32_t dilation_height,
2069   uint32_t dilation_width,
2070   uint32_t groups,
2071   size_t group_input_channels,
2072   size_t group_output_channels,
2073   size_t input_pixel_stride,
2074   size_t output_pixel_stride,
2075   uint8_t input_zero_point,
2076   float input_scale,
2077   uint8_t kernel_zero_point,
2078   float kernel_scale,
2079   const uint8_t* kernel,
2080   const int32_t* bias,
2081   uint8_t output_zero_point,
2082   float output_scale,
2083   uint8_t output_min,
2084   uint8_t output_max,
2085   uint32_t flags,
2086   xnn_operator_t* deconvolution_op_out);
2087 
2088 enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
2089   xnn_operator_t deconvolution_op,
2090   size_t batch_size,
2091   size_t input_height,
2092   size_t input_width,
2093   uint32_t adjustment_height,
2094   uint32_t adjustment_width,
2095   const uint8_t* input,
2096   uint8_t* output,
2097   pthreadpool_t threadpool);
2098 
2099 enum xnn_status xnn_create_fully_connected_nc_qu8(
2100   size_t input_channels,
2101   size_t output_channels,
2102   size_t input_stride,
2103   size_t output_stride,
2104   uint8_t input_zero_point,
2105   float input_scale,
2106   uint8_t kernel_zero_point,
2107   float kernel_scale,
2108   const uint8_t* kernel,
2109   const int32_t* bias,
2110   uint8_t output_zero_point,
2111   float output_scale,
2112   uint8_t output_min,
2113   uint8_t output_max,
2114   uint32_t flags,
2115   xnn_operator_t* fully_connected_op_out);
2116 
2117 enum xnn_status xnn_setup_fully_connected_nc_qu8(
2118   xnn_operator_t fully_connected_op,
2119   size_t batch_size,
2120   const uint8_t* input,
2121   uint8_t* output,
2122   pthreadpool_t threadpool);
2123 
2124 enum xnn_status xnn_create_global_average_pooling_nwc_qu8(
2125   size_t channels,
2126   size_t input_stride,
2127   size_t output_stride,
2128   uint8_t input_zero_point,
2129   float input_scale,
2130   uint8_t output_zero_point,
2131   float output_scale,
2132   uint8_t output_min,
2133   uint8_t output_max,
2134   uint32_t flags,
2135   xnn_operator_t* global_average_pooling_op_out);
2136 
2137 enum xnn_status xnn_setup_global_average_pooling_nwc_qu8(
2138   xnn_operator_t global_average_pooling_op,
2139   size_t batch_size,
2140   size_t width,
2141   const uint8_t* input,
2142   uint8_t* output,
2143   pthreadpool_t threadpool);
2144 
/// Create a Leaky ReLU operator on unsigned 8-bit quantized data in NC layout.
///
/// @param channels - number of channels (C) per batch item.
/// @param input_stride / output_stride - element strides between consecutive
///   batch items in the input and output arrays.
/// @param negative_slope - multiplier applied to negative inputs
///   (in the dequantized domain, given the separate input/output scales —
///   NOTE(review): confirm exact quantized semantics against the implementation).
/// @param input_zero_point, input_scale, output_zero_point, output_scale -
///   asymmetric quantization parameters for input and output.
/// @param output_min / output_max - clamping range applied to the quantized output.
/// @param flags - binary OR of operator flags.
/// @param leaky_relu_op_out - receives the created operator handle on success.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_create_leaky_relu_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);
2158 
/// Setup a Leaky ReLU operator on unsigned 8-bit quantized data,
/// binding input/output pointers and the batch size for subsequent runs.
///
/// @param leaky_relu_op - operator created with xnn_create_leaky_relu_nc_qu8.
/// @param batch_size - number of batch items to process.
/// @param input - pointer to the input data; @param output - pointer to the output data.
/// @param threadpool - pthreadpool handle used to parallelize execution.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_setup_leaky_relu_nc_qu8(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
2165 
/// Create a Sigmoid operator on unsigned 8-bit quantized data in NC layout.
///
/// @param channels - number of channels (C) per batch item.
/// @param input_stride / output_stride - element strides between consecutive
///   batch items in the input and output arrays.
/// @param input_zero_point, input_scale, output_zero_point, output_scale -
///   asymmetric quantization parameters for input and output.
/// @param output_min / output_max - clamping range applied to the quantized output.
/// @param flags - binary OR of operator flags.
/// @param sigmoid_op_out - receives the created operator handle on success.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_create_sigmoid_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);
2178 
/// Setup a Sigmoid operator on unsigned 8-bit quantized data,
/// binding input/output pointers and the batch size for subsequent runs.
///
/// @param sigmoid_op - operator created with xnn_create_sigmoid_nc_qu8.
/// @param batch_size - number of batch items to process.
/// @param input - pointer to the input data; @param output - pointer to the output data.
/// @param threadpool - pthreadpool handle used to parallelize execution.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_setup_sigmoid_nc_qu8(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
2185 
/// Create a SoftMax operator on unsigned 8-bit quantized data in NC layout.
/// Note: unlike the other QU8 creators in this file, there is no input
/// zero point and no output min/max — only input_scale and the output
/// quantization pair are configurable (presumably because softmax output is
/// inherently bounded in [0, 1] — TODO confirm against the implementation).
///
/// @param channels - number of channels (C) per batch item.
/// @param input_stride / output_stride - element strides between consecutive
///   batch items in the input and output arrays.
/// @param input_scale - quantization scale of the input.
/// @param output_zero_point / output_scale - quantization parameters of the output.
/// @param flags - binary OR of operator flags.
/// @param softmax_op_out - receives the created operator handle on success.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_create_softmax_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);
2195 
/// Setup a SoftMax operator on unsigned 8-bit quantized data,
/// binding input/output pointers and the batch size for subsequent runs.
///
/// @param softmax_op - operator created with xnn_create_softmax_nc_qu8.
/// @param batch_size - number of batch items to process.
/// @param input - pointer to the input data; @param output - pointer to the output data.
/// @param threadpool - pthreadpool handle used to parallelize execution.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_setup_softmax_nc_qu8(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
2202 
2203 #endif  // XNN_NO_QU8_OPERATORS
2204 
2205 #ifndef XNN_NO_U8_OPERATORS
2206 
/// Create a Clamp operator on raw (non-quantized) unsigned 8-bit data in NC
/// layout — note the absence of zero-point/scale parameters, distinguishing
/// the U8 family from the QU8 family above.
///
/// @param channels - number of channels (C) per batch item.
/// @param input_stride / output_stride - element strides between consecutive
///   batch items in the input and output arrays.
/// @param output_min / output_max - inclusive clamping range for output values.
/// @param flags - binary OR of operator flags.
/// @param clamp_op_out - receives the created operator handle on success.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_create_clamp_nc_u8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);
2215 
/// Setup a Clamp operator on unsigned 8-bit data, binding input/output
/// pointers and the batch size for subsequent runs.
///
/// @param clamp_op - operator created with xnn_create_clamp_nc_u8.
/// @param batch_size - number of batch items to process.
/// @param input - pointer to the input data; @param output - pointer to the output data.
/// @param threadpool - pthreadpool handle used to parallelize execution.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_setup_clamp_nc_u8(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
2222 
/// Create a 2-D Max Pooling operator on unsigned 8-bit data in NHWC layout.
///
/// @param input_padding_top/right/bottom/left - implicit zero-padding around the input
///   (NOTE(review): padding value for max pooling on u8 is presumably ignored or
///   treated as -inf equivalent — confirm against the implementation).
/// @param pooling_height / pooling_width - pooling window dimensions.
/// @param stride_height / stride_width - displacement of the window between output pixels.
/// @param dilation_height / dilation_width - spacing between window elements.
/// @param channels - number of channels (C) per pixel.
/// @param input_pixel_stride / output_pixel_stride - element strides between
///   consecutive pixels in the input and output arrays.
/// @param output_min / output_max - inclusive clamping range for output values.
/// @param flags - binary OR of operator flags (e.g. XNN_FLAG_TENSORFLOW_SAME_PADDING).
/// @param max_pooling_op_out - receives the created operator handle on success.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_create_max_pooling2d_nhwc_u8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);
2241 
/// Setup a 2-D Max Pooling operator on unsigned 8-bit data, binding
/// input/output pointers and the input dimensions for subsequent runs.
///
/// @param max_pooling_op - operator created with xnn_create_max_pooling2d_nhwc_u8.
/// @param batch_size - batch size of the input and output tensors.
/// @param input_height / input_width - spatial dimensions of the input.
/// @param input - pointer to the input tensor data.
/// @param output - pointer to the output tensor data.
/// @param threadpool - pthreadpool handle used to parallelize execution.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
2250 
2251 #endif  // XNN_NO_U8_OPERATORS
2252 
2253 #ifndef XNN_NO_X8_OPERATORS
2254 
/// Create a Channel Shuffle operator on 8-bit data in NC layout. The X8 family
/// is data-type-agnostic within the element size: the setup function below
/// takes void pointers, so any 8-bit element interpretation works.
///
/// @param groups - number of groups to shuffle channels across.
/// @param group_channels - number of channels within each group
///   (total channels = groups * group_channels — NOTE(review): inferred from
///   the parameter pair; confirm against the implementation).
/// @param input_stride / output_stride - element strides between consecutive
///   batch items in the input and output arrays.
/// @param flags - binary OR of operator flags.
/// @param channel_shuffle_op_out - receives the created operator handle on success.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_create_channel_shuffle_nc_x8(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);
2262 
/// Setup a Channel Shuffle operator on 8-bit data, binding input/output
/// pointers and the batch size for subsequent runs.
///
/// @param channel_shuffle_op - operator created with xnn_create_channel_shuffle_nc_x8.
/// @param batch_size - number of batch items to process.
/// @param input / output - type-erased pointers to 8-bit element data.
/// @param threadpool - pthreadpool handle used to parallelize execution.
/// @return status code indicating success or the failure reason.
enum xnn_status xnn_setup_channel_shuffle_nc_x8(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2269 
2270 #endif  // XNN_NO_X8_OPERATORS
2271 
2272 #ifdef __cplusplus
2273 }  // extern "C"
2274 #endif
2275