1 /*
2  * Copyright (c) 2019, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AV1_COMMON_CNN_H_
13 #define AOM_AV1_COMMON_CNN_H_
14 
15 #ifdef __cplusplus
16 extern "C" {
17 #endif
18 
19 #include <math.h>
20 
21 #include "aom_util/aom_thread.h"
22 #include "config/av1_rtcd.h"
23 
24 struct AV1Common;
25 
// Compile-time limits for CNN descriptions.
// CNN_MAX_LAYERS is the hidden-layer limit plus one (num_layers in CNN_CONFIG
// is the number of hidden layers + 1); it sizes CNN_CONFIG.layer_config.
#define CNN_MAX_HIDDEN_LAYERS 64
#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
// NOTE(review): CNN_MAX_CHANNELS and CNN_MAX_THREADS are not used elsewhere in
// this header; presumably they are enforced by the implementation - TODO
// confirm.
#define CNN_MAX_CHANNELS 256
// Branch indices are in [0, CNN_MAX_BRANCHES - 1]; 0 is the primary branch.
#define CNN_MAX_BRANCHES 4
#define CNN_MAX_THREADS 32

// Initializer for a struct CNN_BRANCH_CONFIG with all three fields zero:
// nothing is copied to other branches and no branches are combined.
#define NO_BRANCH_CONFIG \
  { 0, 0, 0 }
// Initializer for a struct CNN_BATCHNORM_PARAMS with all four pointers NULL,
// which disables batch normalization.
#define NO_BN_PARAMS \
  { NULL, NULL, NULL, NULL }
36 
// Padding modes selectable per layer (see CNN_LAYER_CONFIG.pad).
// NOTE(review): UENUM1BYTE is defined outside this header; presumably it
// declares PADDING_TYPE as a one-byte unsigned type - TODO confirm.
enum {
  PADDING_SAME_ZERO,       // TensorFlow's SAME padding with pixels outside
                           // the image area assumed to be 0 (default)
  PADDING_SAME_REPLICATE,  // TensorFlow's SAME padding with pixels outside
                           // the image area replicated from the closest edge
  PADDING_VALID            // TensorFlow's VALID padding
} UENUM1BYTE(PADDING_TYPE);
44 
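// NOTE(review): the ACTIVATION type used by CNN_LAYER_CONFIG is not defined in
// this header; it is presumably provided by one of the included headers - TODO
// confirm. The definition below is intentionally left commented out.
// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);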
46 
// Points in a layer at which a tensor may be copied to the branches given in
// input_to_branches.
// BRANCH_NO_COPY: doesn't copy any tensor.
// BRANCH_INPUT: copies the input tensor to branches.
// BRANCH_OUTPUT: copies the convolved tensor to branches.
// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
//   tensor. If no combinations happen at this layer, then this option
//   has the same effect as BRANCH_OUTPUT.
enum {
  BRANCH_NO_COPY,
  BRANCH_INPUT,
  BRANCH_OUTPUT,
  BRANCH_COMBINED
} UENUM1BYTE(BRANCH_COPY);
60 
// Ways of combining branches with the output of the current layer:
// BRANCH_NOC: no branch combining.
// BRANCH_ADD: add the previously stored branch tensor to the output of the
//   layer.
// BRANCH_CAT: concatenate the branch tensor to the output of the layer.
enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
66 
// The parameters used to scale each channel in batch normalization. The
// processing is done on a per-channel basis, e.g. bn_mean[c] is the mean for
// all pixels in channel c. Batch normalization is always applied after
// activation. The output is given by
//   out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c]
// where
//   norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
// Here we assume that the effect of variance_epsilon is already taken into
// account when bn_std is calculated. The pointers need to be either all NULL
// or all valid. If all are NULL, batchnorm is disabled; otherwise batchnorm is
// applied.
struct CNN_BATCHNORM_PARAMS {
  const float *bn_gamma;  // per-channel scale
  const float *bn_beta;   // per-channel offset
  const float *bn_mean;   // per-channel mean
  const float *bn_std;    // per-channel standard deviation
};
83 
// Describes how a layer exchanges tensors with other branches. Branch 0 is the
// primary branch. NO_BRANCH_CONFIG initializes all three fields to zero.
struct CNN_BRANCH_CONFIG {
  // If nonzero, a copy of the active tensor is stored for future use in the
  // branches specified in this field as a binary mask. For example, if
  // input_to_branches = 0x06, the input tensor to the current branch is
  // copied to branches 1 and 2 (where 0 represents the primary branch). One
  // restriction is that the mask cannot indicate copying to the current
  // branch.
  int input_to_branches;
  // Applies to the copy made to the branches given in input_to_branches: if
  // greater than 0, only the channels up to the given index are copied.
  // NOTE(review): presumably all channels are copied otherwise - TODO confirm.
  int channels_to_copy;
  // Mask of branches to combine with the output of the current layer, if
  // branch_combine_type != BRANCH_NOC. For example, if
  // branches_to_combine = 0x0A, it means that branches 1 and 3 are combined
  // with the current branch.
  int branches_to_combine;
};
104 
// Configuration of a single CNN layer: filter geometry, weights, padding,
// activation, branch handling, batch normalization and output routing.
struct CNN_LAYER_CONFIG {
  int in_channels;        // number of input channels (see weights layout)
  int filter_width;       // filter width (see weights layout)
  int filter_height;      // filter height (see weights layout)
  int out_channels;       // number of output channels (length of bias)
  int skip_width;         // horizontal skip; values > 1 change the resolution
                          // (see deconvolve)
  int skip_height;        // vertical skip; values > 1 change the resolution
                          // (see deconvolve)
  int maxpool;            // whether to use maxpool or not (only effective when
                          // skip_width or skip_height are > 1)
  const float *weights;   // array of length filter_height x filter_width x
                          // in_channels x out_channels where the inner-most
                          // scan is out_channels and the outer most scan is
                          // filter_height.
  const float *bias;      // array of length out_channels
  PADDING_TYPE pad;       // padding type
  ACTIVATION activation;  // the activation function to use after convolution
  int deconvolve;         // whether this is a deconvolution layer.
                          // 0: If skip_width or skip_height are > 1, then we
                          // reduce resolution
                          // 1: If skip_width or skip_height are > 1, then we
                          // increase resolution
  int branch;             // branch index in [0, CNN_MAX_BRANCHES - 1], where
                          // 0 refers to the primary branch.
  BRANCH_COPY branch_copy_type;        // which tensor, if any, is copied to
                                       // other branches
  BRANCH_COMBINE branch_combine_type;  // how branches are combined with the
                                       // output of this layer
  struct CNN_BRANCH_CONFIG branch_config;  // branch masks used by the two
                                           // fields above
  struct CNN_BATCHNORM_PARAMS
      bn_params;   // A struct that contains the parameters
                   // used for batch normalization.
  int output_num;  // The output buffer idx to which the layer output is
                   // written. Set to -1 to disable writing it to the output. In
                   // the case that branch_combine_type is BRANCH_CAT, all
                   // concatenated channels will be written to output. In the
                   // case of BRANCH_ADD, the output will be the result of
                   // summation.
};
141 
// Configuration of a whole CNN: global options plus one CNN_LAYER_CONFIG per
// layer. Only the first num_layers entries of layer_config are meaningful;
// the array holds at most CNN_MAX_LAYERS entries.
struct CNN_CONFIG {
  int num_layers;  // number of CNN layers ( = number of hidden layers + 1)
  int is_residue;  // whether the output activation is a residue
  int ext_width, ext_height;  // extension horizontally and vertically
  int strict_bounds;          // whether the input bounds are strict or not.
                              // If strict, the extension area is filled by
                              // replication; if not strict, image data is
                              // assumed available beyond the bounds.
  CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];  // per-layer configuration
};
152 
// Worker threads made available to the CNN prediction functions.
// NOTE(review): AVxWorker is declared outside this header (presumably in
// aom_util/aom_thread.h); num_workers is presumably the number of entries in
// the workers array - TODO confirm.
struct CNN_THREAD_DATA {
  int num_workers;
  AVxWorker *workers;
};
157 
// Output description passed to the multi-output prediction functions
// (av1_cnn_predict_img_multi_out and its highbd variant).
// NOTE(review): output_channels, output_strides and output_buffer presumably
// each hold one entry per output, indexed like CNN_LAYER_CONFIG.output_num -
// TODO confirm against the implementation.
struct CNN_MULTI_OUT {
  int num_outputs;              // number of outputs
  const int *output_channels;   // channel counts of the outputs
  const int *output_strides;    // strides of the output buffers
  float **output_buffer;        // buffers that receive the outputs
};
164 
// Reports the size of the CNN output for an input of in_width x in_height
// processed with cnn_config. The results are returned through out_width,
// out_height and out_channels.
// NOTE(review): for a CNN with several outputs these presumably point to
// arrays with one entry per output - TODO confirm.
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels);
169 
// Prediction functions from a set of input image buffers. These functions
// support CNNs with multiple outputs; results are delivered through the
// CNN_MULTI_OUT structure pointed to by output.
// dgd holds the input image buffers, which share the given width, height and
// stride. The highbd variant takes 16-bit samples and their bit_depth.
// NOTE(review): dgd presumably holds one buffer pointer per input channel of
// the first layer - TODO confirm.
void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   struct CNN_MULTI_OUT *output);
void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth, CNN_MULTI_OUT *output);
181 
// Prediction functions from a set of input image buffers. These functions
// only support a single output, which is written to the buffers in output
// using out_stride.
// dgd holds the input image buffers, which share the given width, height and
// stride. The highbd variant takes 16-bit samples and their bit_depth.
// NOTE(review): output presumably holds one buffer pointer per output channel
// - TODO confirm.
void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                         const CNN_CONFIG *cnn_config,
                         const CNN_THREAD_DATA *thread_data, float **output,
                         int out_stride);
void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                int stride, const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data,
                                int bit_depth, float **output, int out_stride);
192 
193 #ifdef __cplusplus
194 }  // extern "C"
195 #endif
196 
197 #endif  // AOM_AV1_COMMON_CNN_H_
198