1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
17 
18 #include <algorithm>
19 #include <cmath>
20 #include <cstdint>
21 
22 #include "Eigen/Core"
23 #include "tensorflow/lite/c/builtin_op_data.h"
24 #include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
25 
// When building with MSVC (_MSC_VER defined), map the `__restrict__` spelling
// used by the declarations in this header onto `__restrict`.
#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif
29 
30 namespace tflite {
31 
// Not all backends support CpuBackendContext usage, so it is only forward
// declared here to avoid pulling in its implementation. This header refers to
// it exclusively through pointers, and use of CpuBackendContext in method
// implementations is purely optional.
class CpuBackendContext;
36 
37 namespace tensor_utils {
38 
// Same as the function above, but additionally takes a scratch buffer for the
// int8 x int8 -> int32 intermediate values and a CpuBackendContext for the
// accumulator computation.
// NOTE(review): "the function above" is not declared in this header;
// presumably it is an overload from tensor_utils_common.h -- confirm.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    int32_t* __restrict__ scratch, float* __restrict__ result,
    CpuBackendContext* __restrict__ context);
48 
// Same as the function above, except that it can make use of cached row sums
// (`row_sums`).
// NOTE(review): `compute_row_sums` presumably tells the implementation whether
// `row_sums` still has to be computed; the definition is not part of this
// header -- confirm.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);
56 
57 // Same as the function above, but provides separate scaling factor for the
58 // matrix and the vectors. The scaling factors are multiplied in the
59 // scaling_factor_scratch buffer.
MatrixBatchVectorMultiplyAccumulate(const int8_t * __restrict__ matrix,const int m_rows,const int m_cols,const int8_t * __restrict__ vectors,const float matrix_scaling_factor,const float * vector_scaling_factors,int n_batch,float * __restrict__ result,const float * per_channel_scale,const int32_t * input_offset,int32_t * scratch,int32_t * row_sums,bool * compute_row_sums,float * scaling_factor_scratch,CpuBackendContext * context)60 inline void MatrixBatchVectorMultiplyAccumulate(
61     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
62     const int8_t* __restrict__ vectors, const float matrix_scaling_factor,
63     const float* vector_scaling_factors, int n_batch,
64     float* __restrict__ result, const float* per_channel_scale,
65     const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
66     bool* compute_row_sums, float* scaling_factor_scratch,
67     CpuBackendContext* context) {
68   for (int b = 0; b < n_batch; ++b) {
69     scaling_factor_scratch[b] =
70         vector_scaling_factors[b] * matrix_scaling_factor;
71   }
72   MatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
73                                       scaling_factor_scratch, n_batch, result,
74                                       per_channel_scale, input_offset, scratch,
75                                       row_sums, compute_row_sums, context);
76 }
77 
// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
// dimension composed of input vectors independent from each other). The result
// of the multiplication is accumulated to the passed result buffer.
// More specifically, for a matrix M of shape [n, i] and a batched-vector
// of shape [i, batch] it will first compute the product of shape [n, batch].
// This product will be accumulated to the result buffer.
// Parameters:
//     - input: batch vector of size n_batch * n_input
//     - bias:  vector of size b_input
//              NOTE(review): there is no `b_input` parameter; presumably this
//              should read n_output -- confirm against the implementation.
//     - input_to_gate_weights: matrix of size n_input * n_output
//     - multiplier: scalar
//     - shift: scalar
//     - n_batch: the batch size
//     - n_input: the input size
//     - n_output: the output size
//     - output_zp: the zero point of the output.
//     - scratch: batch vector of size n_batch * n_output
//     - output: the 16 bit output
// Notes:
//     - this is used for gate matmul: for non-cifg it is for input, forget,
//       cell, output gates; for cifg, it is for forget, cell, output gates.
//     - multiplier and shift combined give the scale.
//     - assumes input zero point is 0.
//     - scratch is created for optimization purposes only.
// TODO(b/152066492): this can be removed if some future optimization
// work makes it unnecessary.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);
109 
// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
// dimension composed of input vectors independent from each other). The result
// of the multiplication is accumulated to the passed result buffer.
// More specifically, for a matrix M of shape [n, i] and a batched-vector
// of shape [i, batch] it will first compute the product of shape [n, batch].
// This product will be accumulated to the result buffer.
// Parameters:
//     - input: batch vector of size n_batch * n_input
//     - bias:  vector of size b_input
//              NOTE(review): there is no `b_input` parameter; presumably this
//              should read n_output -- confirm against the implementation.
//     - input_to_gate_weights: matrix of size n_input * n_output
//     - multiplier: scalar
//     - shift: scalar
//     - n_batch: the batch size
//     - n_input: the input size
//     - n_output: the output size
//     - output_zp: the zero point of the output.
//     - scratch: batch vector of size n_batch * n_output
//     - output: the 8 bit output
// Notes:
//     - this is used for projection matmul (the weight parameter is
//       nevertheless named input_to_gate_weights).
//     - multiplier and shift combined give the scale.
//     - assumes input zero point is 0.
//     - scratch is created for optimization purposes only.
// TODO(b/152066492): this can be removed if some future optimization
// work makes it unnecessary.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);
140 
// Rectified Linear: stores max(0, vector[i]) in result[i] for each of the
// first v_size elements. Nothing is written when v_size <= 0.
inline void ApplyReluToVector(const float* __restrict__ vector, int v_size,
                              float* __restrict__ result) {
  const float* src = vector;
  float* dst = result;
  for (int remaining = v_size; remaining > 0; --remaining) {
    *dst++ = std::max(0.0f, *src++);
  }
}
148 
// Rectified Linear 1: clamps each of the first v_size elements of `vector`
// to the range [-1, 1] and stores it in `result`. Nothing is written when
// v_size <= 0.
inline void ApplyRelu1ToVector(const float* __restrict__ vector, int v_size,
                               float* __restrict__ result) {
  const float* src = vector;
  float* dst = result;
  for (int remaining = v_size; remaining > 0; --remaining) {
    // Cap from above first, then from below (same nesting as before so that
    // the outcome for every input, NaN included, stays the same).
    const float capped_above = std::min(*src++, 1.0f);
    *dst++ = std::max(-1.0f, capped_above);
  }
}
156 
// Rectified Linear 6: clamps each of the first v_size elements of `vector`
// to the range [0, 6] and stores it in `result`. Nothing is written when
// v_size <= 0.
inline void ApplyRelu6ToVector(const float* __restrict__ vector, int v_size,
                               float* __restrict__ result) {
  const float* src = vector;
  float* dst = result;
  for (int remaining = v_size; remaining > 0; --remaining) {
    // Cap from above first, then from below (same nesting as before).
    const float capped_above = std::min(*src++, 6.0f);
    *dst++ = std::max(0.0f, capped_above);
  }
}
164 
165 // Apply tanh to elements of a vector
ApplyTanhToVector(const float * __restrict__ vector,int v_size,float * __restrict__ result)166 inline void ApplyTanhToVector(const float* __restrict__ vector, int v_size,
167                               float* __restrict__ result) {
168   using VectorMap = Eigen::Map<Eigen::Vector<float, Eigen::Dynamic>>;
169   VectorMap input_map(const_cast<float* __restrict__>(vector), v_size);
170   VectorMap output_map(result, v_size);
171   output_map.array() = input_map.array().tanh();
172 }
173 
// Apply signbit to elements of a vector: result[i] becomes 1.0f when the sign
// bit of vector[i] is set (this includes -0.0f) and 0.0f otherwise, for each
// of the first v_size elements. Nothing is written when v_size <= 0.
inline void ApplySignbitToVector(const float* __restrict__ vector, int v_size,
                                 float* __restrict__ result) {
  const float* src = vector;
  float* dst = result;
  for (int remaining = v_size; remaining > 0; --remaining) {
    *dst++ = std::signbit(*src++) ? 1.0f : 0.0f;
  }
}
181 
182 // Apply sigmoid to elements of a vector.
ApplySigmoidToVector(const float * __restrict__ vector,int v_size,float * __restrict__ result)183 inline void ApplySigmoidToVector(const float* __restrict__ vector, int v_size,
184                                  float* __restrict__ result) {
185   using VectorMap = Eigen::Map<Eigen::Vector<float, Eigen::Dynamic>>;
186   VectorMap input_map(const_cast<float* __restrict__>(vector), v_size);
187   VectorMap output_map(result, v_size);
188   output_map.array() = input_map.array().logistic();
189 }
190 
191 // Apply appropriate activation function to elements of a vector.
ApplyActivationToVector(const float * __restrict__ vector,int v_size,TfLiteFusedActivation activation,float * __restrict__ result)192 inline void ApplyActivationToVector(const float* __restrict__ vector,
193                                     int v_size,
194                                     TfLiteFusedActivation activation,
195                                     float* __restrict__ result) {
196   switch (activation) {
197     case kTfLiteActNone:
198       return;
199     case kTfLiteActRelu:
200       return ApplyReluToVector(vector, v_size, result);
201     case kTfLiteActReluN1To1:
202       return ApplyRelu1ToVector(vector, v_size, result);
203     case kTfLiteActRelu6:
204       return ApplyRelu6ToVector(vector, v_size, result);
205     case kTfLiteActTanh:
206       return ApplyTanhToVector(vector, v_size, result);
207     case kTfLiteActSignBit:
208       return ApplySignbitToVector(vector, v_size, result);
209     case kTfLiteActSigmoid:
210       return ApplySigmoidToVector(vector, v_size, result);
211   }
212 }
213 
214 }  // namespace tensor_utils
215 }  // namespace tflite
216 
217 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
218