1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_COMMON_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_COMMON_H_
17 
18 #include <algorithm>
19 #include <cstdint>
20 
21 #if defined(_MSC_VER)
22 #define __restrict__ __restrict
23 #endif
24 
25 namespace tflite {
26 
27 namespace tensor_utils {
28 
29 // Checks if all entries of vector are zero for float.
30 bool IsZeroVector(const float* vector, int v_size);
31 
32 // Checks if all entries of vector are zero for int8.
33 bool IsZeroVector(const int8_t* vector, int v_size);
34 
35 // Quantizes a buffer of floating point values using a symmetric quantization
36 // (i.e. linear quantization without an offset) to 8-bit signed integers.
37 // It also outputs the range (min, max) of the floating point buffer, and the
38 // scaling factor used to quantize the values.
39 void SymmetricQuantizeFloats(const float* values, const int size,
40                              int8_t* quantized_values, float* min_value,
41                              float* max_value, float* scaling_factor);
42 
43 // Quantizes a buffer of floating point values using a symmetric quantization
44 // (i.e. linear quantization without an offset) to 8-bit signed integers.
45 // It uses the range (min, max) provided to the function to calculate the
46 // appropriate scaling factor to quantize the values.
47 void SymmetricQuantizeFloats(const float* values, const int size,
48                              int8_t* quantized_values, float min_value,
49                              float max_value, float* scaling_factor);
50 
51 void AsymmetricQuantizeFloats(const float* values, const int size,
52                               int8_t* quantized_values, float* scaling_factor,
53                               int32_t* offset);
54 
55 // Helper function to quantize floats.
56 // float_data_ptr     input float vectors
57 // n_batch            number of input vectors
58 // n_data             size of a single input vector
59 // quantized_data_ptr (out) vector with quantized data
60 // scaling_factors    (out) scaling factors (one per vector)
61 // zero_points        (out) zero points (one per vector)
62 // do_asymmetric      controls if the quantization should be asymmetric.
BatchQuantizeFloats(const float * float_data_ptr,int n_batch,int n_data,int8_t * quantized_data_ptr,float * scaling_factors,int32_t * zero_points,bool do_asymmetric)63 inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch,
64                                 int n_data, int8_t* quantized_data_ptr,
65                                 float* scaling_factors, int32_t* zero_points,
66                                 bool do_asymmetric) {
67   for (int b = 0; b < n_batch; ++b) {
68     const int offset = b * n_data;
69     if (do_asymmetric) {
70       tensor_utils::AsymmetricQuantizeFloats(
71           float_data_ptr + offset, n_data, quantized_data_ptr + offset,
72           &scaling_factors[b], &zero_points[b]);
73     } else {
74       float unused_min, unused_max;
75       tensor_utils::SymmetricQuantizeFloats(
76           float_data_ptr + offset, n_data, quantized_data_ptr + offset,
77           &unused_min, &unused_max, &scaling_factors[b]);
78     }
79   }
80 }
81 
82 // Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
83 // dimension composed by input vectors independent from each other). The result
84 // of the multiplication is accumulated to the passed result buffer.
85 // More specifically, for a matrix M of shape [n, i] and a batched-vector
86 // of shape [i, batch] it will first compute the product of shape [n, batch].
87 // This product will be accumulated to the result buffer.
88 void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
89                                          int m_cols, const float* vector,
90                                          int n_batch, float* result);
91 
92 // Same as the function above, but the matrix is a sparse tensor with block
93 // pattern 1x4.
94 // This function assumes that m_cols is a multiple of the block size (4 in this
95 // case) so that there's no incomplete block.
96 void SparseMatrixBatchVectorMultiplyAccumulate1x4(
97     const float* __restrict__ matrix, const int32_t* __restrict__ segments,
98     const int32_t* __restrict__ indices, int m_rows, int m_cols,
99     const float* __restrict__ vector, int n_batch, float* __restrict__ result);
100 
101 // Same as the function above, but the matrix is stored in block compressed
102 // sparse row format with block pattern 1x16 which consists of two arrays:
103 //   1. A matrix array stores non-zero blocks of the matrix in row major.
104 //   2. A ledger array stores nrows groups, one group per row. Each group starts
105 //      with an integer representing the number of non-zero blocks for the
106 //      corresponding row and follows with column indexes of the first element
107 //      of each non-zero block.
108 // This function assumes that
109 //   1. m_cols is a multiple of 16 so that all blocks are full blocks.
110 //   2. m_cols < 254 * 16 so that block index can be represented by uint8.
111 void SparseMatrixBatchVectorMultiplyAccumulate(
112     const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
113     int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
114     float* __restrict__ result);
115 
116 // Same as the function above, but for values quantized using symmetric
117 // quantization (e.g. by calling SymmetricQuantizeFloats).
118 // The passed scaling factors is a buffer of the quantization scaling factors
119 // that will be used to dequentize the products into the final result buffer.
120 // These scaling factors are the multiplication of the matrix scaling factor
121 // by the vector's scaling factor, one per batch (i.e. this allows quantizing
122 // each batch in the batch-vector matrix independently).
123 void MatrixBatchVectorMultiplyAccumulate(
124     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
125     const int8_t* __restrict__ vectors,
126     const float* __restrict__ scaling_factors, int n_batch,
127     float* __restrict__ result);
128 
129 // Same as the function above except that vector values
130 // are quantized with asymmetric quantization per-batch and the matrix
131 // is quantized per row.
132 void MatrixBatchVectorMultiplyAccumulate(
133     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
134     const int8_t* __restrict__ vectors,
135     const float* __restrict__ scaling_factors, int n_batch,
136     float* __restrict__ result, const float* __restrict__ per_channel_scale,
137     const int32_t* __restrict__ input_offset);
138 
139 // Same as the function above, but the matrix is stored in block compressed
140 // sparse row format with block pattern 1x16 which consists of two arrays:
141 //   1. A matrix array stores non-zero blocks of the matrix in row major.
142 //   2. A ledger array stores nrows groups, one group per row. Each group starts
143 //      with an integer representing the number of non-zero blocks for the
144 //      corresponding row followed by column index of the first element of
145 //      each non-zero block.
146 // This function assumes that
147 //   1. m_cols is a multiple of 16 so that all blocks are full blocks.
148 //   2. m_cols < 254 * 16 so that block index can be represented by uint8.
149 void SparseMatrixBatchVectorMultiplyAccumulate(
150     const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
151     const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
152     const float* __restrict__ scaling_factors, int n_batch,
153     float* __restrict__ result);
154 
155 // Same as the above 8, 8, 8 integer matmul except for the presence of zero
156 // point and non-accumulative.
157 // TODO(b/148688698): remove this function by folding zero point calculation in
158 // prepare() function.
159 void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
160                                const int8_t* input_to_gate_weights,
161                                int32_t input_to_gate_effective_scale_a,
162                                int32_t input_to_gate_effective_scale_b,
163                                int32_t n_batch, int32_t n_input, int32_t n_cell,
164                                int8_t* gate_output, int8_t gate_output_zp);
165 
166 // Same as above but has 16 bit and 8 bit input and 8 bit output.
167 // Used in projection when hidden is 16bit.
168 void MatrixBatchVectorMultiply(const int16_t* hidden,
169                                const int8_t* hidden_to_output_weights,
170                                int32_t proj_effective_scale_a,
171                                int32_t proj_effective_scale_b,
172                                const int32_t* gate_bias, int32_t n_batch,
173                                int32_t n_hidden, int32_t n_output,
174                                int32_t output_zp, int8_t* proj_output);
175 
176 // Multiplies a matrix with a scalar and reduce the result on each row to a
177 // scalar.
178 // Parameters:
179 //     - matrix: matrix of size n_row * n_col
180 //     - scalar: the scalar that is multiplied to each element in the matrix
181 //     - n_row:  the row count of the matrix
182 //     - n_col:  the column count of the matrix
183 //     - output: the 32bit output
184 // Note: We do not need saturation because the int8 * int8 is safe from overflow
185 // in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero
186 // initial output value is not exceptionally large.
187 void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
188                                     int32_t n_row, int32_t n_col,
189                                     int32_t* output);
190 
191 // Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a Quantized
192 // vector.
193 // Parameters:
194 //     - input: batch vector of size n_batch * n_input; 16 bit.
195 //     - layer_norm_weights:  the quantized layer normalization weights.
196 //     - bias: the bias for the layer normalization.
197 //     - layer_norm_scale_a: multiplier for scale factor.
198 //     - layer_norm_scale_b: shift for scale factor.
199 //     - variance_limit: the guard to make sure the inverse does not overflow.
200 //     - n_batch: the number of batches.
201 //     - n_input: the size for input and output.
202 //     - output:  the 16 bit output
203 void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
204                     const int32_t* bias, int32_t layer_norm_scale_a,
205                     int32_t layer_norm_scale_b, int32_t variance_limit,
206                     int n_batch, int n_input, int16_t* output);
207 
208 // Same as above but the internal calculation is done in float.
209 void ApplyLayerNormFloat(const int16_t* input,
210                          const int16_t* layer_norm_weights,
211                          int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
212                          const int32_t* bias, int n_batch, int n_input,
213                          int16_t* output);
214 
215 // Apply Sigmoid to a quantized vector.
216 // Parameters:
217 //     - input: batch vector of size n_batch * n_input; 16 bit.
218 //     - n_batch: the number of batches.
219 //     - n_input: the size for input and output.
220 //     - output:  the 16 bit output
221 // The input is in Q3.12 format and the output is in Q0.15 format.
222 void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
223                   int16_t* output);
224 
225 // Same as above but the internal calcualtion is float.
226 void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
227                        int16_t* output);
228 
229 // Apply Tanh to a quantized vector.
230 // Parameters:
231 //     - integer_bits: the integer bits of the input.
232 //                     Currently supports 0, 1, 2, 3, 4, 5, 6.
233 //     - input: batch vector of size n_batch * n_input; 16 bit.
234 //     - n_batch: the number of batches.
235 //     - n_input: the size for input and output.
236 //     - output:  the 16 bit output
237 // The input is in Qm.15-m format and the output is in Q0.15 format.
238 void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
239                int32_t n_input, int16_t* output);
240 
241 // Apply Tanh to a quantized vector. Tbe internal calculation is in float.
242 //    - Input has 2^(integer_bits) as scale.
243 //    - Output has Q0.15 as scale.
244 void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
245                     int32_t integer_bits, int16_t* output);
246 
247 // Element-wise multiplication of two quantized vectors.
248 // Parameters:
249 //     - input_1: batch vector of size n_batch * n_input; 16 bit.
250 //     - input_2: batch vector of size n_batch * n_input; 16 bit.
251 //     - n_batch: the number of batches.
252 //     - n_input: the size for input and output.
253 //     - shift:   the shift needed to produce the output.
254 //     - output:  the 16 bit output of size n_batch * n_input.
255 // Output does not need to be initialized.
256 void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
257               int n_input, int shift, int16_t* output);
258 
259 // Element-wise multiplication of two quantized vectors.
260 // Parameters:
261 //     - input_1: batch vector of size n_batch * n_input; 16 bit.
262 //     - input_2: batch vector of size n_batch * n_input; 16 bit.
263 //     - n_batch: the number of batches.
264 //     - n_input: the size for input and output.
265 //     - shift:   the shift needed to produce the output.
266 //     - output:  the 8 bit output of size n_batch * n_input.
267 // Output does not need to be initialized.
268 void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
269               int n_input, int shift, int8_t* output);
270 
271 // Element-wise multiplication of two quantized vectors with rescaling.
272 // Parameters:
273 //     - input_1:    batch vector of size n_batch * n_input; 16 bit.
274 //     - input_2:    batch vector of size n_batch * n_input; 16 bit.
275 //     - multiplier: the multiplier part of scale.
276 //     - shift:      the shift part of scale.
277 //     - n_batch:    the number of batches.
278 //     - n_input:    the size for input and output.
279 //     - output:     the 8 bit output of size n_batch * n_input.
280 //     - output_zp:  the zero point of output.
281 // Output does not need to be initialized.
282 // Multiplier ("m") and shift ("s") are connected to scale ("s") with s = m *
283 // 2^(s - 31).
284 void CwiseMul(const int16_t* input_1, const int16_t* input_2,
285               int32_t multiplier, int32_t shift, int32_t n_batch,
286               int32_t n_input, int32_t output_zp, int8_t* output);
287 
288 // Element-wise saturating addition of two quantized vectors without rescaling.
289 // Parameters:
290 //     - input_1:    batch vector of size n_batch * n_input; 16 bit.
291 //     - input_2:    batch vector of size n_batch * n_input; 16 bit.
292 //     - n_batch:    the number of batches.
293 //     - n_input:    the size for input and output.
294 //     - output:     the 8 bit output of size n_batch * n_input.
295 // Output does not need to be initialized.
296 void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
297               int n_input, int16_t* output);
298 
299 // Element-wise in-place clipping of a vector. Overloaded for float, int16_t,
300 // int8_t. Parameters:
301 //     - vector:         vector of size v_size.
302 //     - v_size:         the size of the vector.
303 //     - clipping_value: the value used for clipping.
304 void CwiseClipping(float* vector, const int v_size, const float clipping_value);
305 void CwiseClipping(int16_t* vector, const int v_size,
306                    const int16_t clipping_value);
307 void CwiseClipping(int8_t* vector, const int v_size,
308                    const int8_t clipping_value);
309 
310 // Cwise product of two vectors.
311 template <typename T>
VectorVectorCwiseProduct(const T * __restrict__ vector1,const T * __restrict__ vector2,int v_size,T * __restrict__ result)312 inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
313                                      const T* __restrict__ vector2, int v_size,
314                                      T* __restrict__ result) {
315   for (int v = 0; v < v_size; v++) {
316     *result++ = *vector1++ * *vector2++;
317   }
318 }
319 
320 // Cwise product and accumulate of two vectors. Since it's a MAC operation, the
321 // assumption here is that result array is initialized to valid values.
322 template <typename T>
VectorVectorCwiseProductAccumulate(const T * __restrict__ vector1,const T * __restrict__ vector2,int v_size,T * __restrict__ result)323 inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
324                                                const T* __restrict__ vector2,
325                                                int v_size,
326                                                T* __restrict__ result) {
327   for (int v = 0; v < v_size; v++) {
328     *result++ += *vector1++ * *vector2++;
329   }
330 }
331 
332 // Dot product of two vectors.
333 float VectorVectorDotProduct(const float* vector1, const float* vector2,
334                              int v_size);
335 
336 // Dot product of two batch vectors of size n_batch * v_size:
337 // vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
338 //            x_2_1, x_2_2, ..., x_2_vsize,
339 //            ...
340 //            x_nbatch_1,..., x_nbatch_vsize]
341 // vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
342 //            y_2_1, y_2_2, ..., y_2_vsize,
343 //            ...
344 //            y_nbatch_1,..., y_nbatch_vsize]
345 // Then result will be a vector of n_batch size starting from 'result':
346 // [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
347 //  x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
348 //  ...
349 //  x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
350 template <typename T>
BatchVectorBatchVectorDotProduct(const T * vector1,const T * vector2,int v_size,int n_batch,T * result)351 inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2,
352                                              int v_size, int n_batch,
353                                              T* result) {
354   for (int b = 0; b < n_batch; b++) {
355     result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
356     vector1 += v_size;
357     vector2 += v_size;
358   }
359 }
360 
361 // Same as above but input is 16bit and output is 32bit.
362 void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
363                                       const int16_t* vector2, int v_size,
364                                       int n_batch, int32_t* result);
365 
366 // Cwise product of a vector and a batch-vector.
367 template <typename T>
VectorBatchVectorCwiseProduct(const T * vector,int v_size,const T * batch_vector,int n_batch,T * result)368 inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
369                                           const T* batch_vector, int n_batch,
370                                           T* result) {
371   for (int b = 0; b < n_batch; b++) {
372     VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
373     // Update the pointers.
374     result += v_size;
375     batch_vector += v_size;
376   }
377 }
378 
379 // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
380 // operation, the assumption here is that result array is initialized to valid
381 // values.
382 template <typename T>
VectorBatchVectorCwiseProductAccumulate(const T * vector,int v_size,const T * batch_vector,int n_batch,T * result)383 inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size,
384                                                     const T* batch_vector,
385                                                     int n_batch, T* result) {
386   for (int b = 0; b < n_batch; b++) {
387     VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
388     // Update the pointers.
389     result += v_size;
390     batch_vector += v_size;
391   }
392 }
393 
394 // Same as above, but inputs are 16bit integer and output is 16bit integer.
395 void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
396                                              const int16_t* batch_vector,
397                                              int n_batch, int32_t multiplier,
398                                              int shift, int16_t* result);
399 
400 // Add another vector for each batch in the batch vector.
401 template <typename T>
VectorBatchVectorAdd(const T * vector,int v_size,int n_batch,T * batch_vector)402 void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
403                           T* batch_vector) {
404   for (int b = 0; b < n_batch; b++) {
405     for (int i = 0; i < v_size; ++i) {
406       batch_vector[i] += vector[i];
407     }
408     batch_vector += v_size;
409   }
410 }
411 
412 // Batch vector initialization with another vector.
413 template <typename T>
VectorBatchVectorAssign(const T * vector,int v_size,int n_batch,T * batch_vector)414 void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
415                              T* batch_vector) {
416   for (int b = 0; b < n_batch; b++) {
417     std::copy_n(vector, v_size, batch_vector + b * v_size);
418   }
419 }
420 
421 // Compute "1.0f - elements of vector" (used in CIFG).
422 void Sub1Vector(const float* vector, int v_size, float* result);
423 
424 // Compute "1.0f - elements of vector" (used in CIFG) for int16 input.
425 // "vector" has range [0, 32767] because it is the output of sigmoid function.
426 void Sub1Vector(const int16_t* vector, int v_size, int16_t* result);
427 
428 // Multiply all elements of vector with a scalar.
429 void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
430                           float* result);
431 
432 // Reduce-sum on a float input vector:
433 // input_vector: float pointer to input vector.
434 // output_vector: float pointer to vector.
435 // output_size: output vector size.
436 // reduction_size: number of consecutive elements from input vector which are
437 // added to get one element of output.
438 void ReductionSumVector(const float* input_vector, float* output_vector,
439                         int output_size, int reduction_size);
440 
441 // Same as above but input/output is 32 bit integer.
442 void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
443                         int output_size, int reduction_size);
444 
445 // Same as above but input is 8 bit integer.
446 void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
447                         int output_size, int reduction_size);
448 
449 // Layer norm for each batch.
450 void MeanStddevNormalization(const float* __restrict__ input_vector,
451                              float* __restrict__ output_vector, int v_size,
452                              int n_batch);
453 
454 // Saturate Add with rescale on both inputs.
455 void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
456                           const int8_t* recurrent, int8_t recurrent_zp,
457                           int32_t input_effective_scale_a,
458                           int32_t input_effective_scale_b,
459                           int32_t recurrent_effective_scale_a,
460                           int32_t recurrent_effective_scale_b, int32_t n_batch,
461                           int32_t n_cell, int16_t* output);
462 
463 }  // namespace tensor_utils
464 }  // namespace tflite
465 
466 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_COMMON_H_
467