1 /* Copyright 2019 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Internal types and helpers for matrices.
17 // "Mat" is the name we use to refer to our internal matrix classes; it can be
18 // thought of as a shorter version of "InternalMatrix"`
19 //
20 // Ruy has four internal matrix classes, besides the
21 // Matrix<T> class that we expose to the user-facing API.
22 //
23 // TODO(silvasean): Put parts of this architecture description somewhere more
24 // prominent.
25 //
26 // The 4 internal matrix classes are named Mat, EMat, PMat, PEMat, where:
27 // - "E" indicates a type-erased class, storing a void* pointer and a runtime
28 //   enum value to track the scalar type, as opposed to being templatized
29 //   on a Scalar type and storing a Scalar* pointer.
30 // - "P" indicates a packed matrix class, the output of the packing code and
31 //   input of the kernel code. See comments in pack.h regarding packing.
32 //
33 // In other words:
34 //
35 //                Plain matrices   Packed matrices
36 //             +----------------------------------
37 // Templated   |  Mat, Matrix      PMat
38 // Type-erased |  EMat             PEMat
39 //
40 // Note that Matrix<T> is *not* implemented in terms of the internal types. It
41 // is an independent, simple, and user-facing type. Matrix<T> is functionally
42 // equivalent to Mat, but we keep it separate to insulate internals from
43 // interface and to be able to make different compromises in internals
44 // vs interface: in internals we prefer Mat to be a C-style struct with
45 // raw data member access and to be similar to the other PMat/EMat/PEMat
46 // classes for consistency.
47 //
48 // The use of type-erasure might seem surprising for a library like Ruy with a
49 // heavily-templated entry point, but it is motivated by the desire for most of
50 // Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3
51 // main parts:
52 // - "entry-point" (ruy.h) - this is the highly templated ruy::Mul entry
53 // point.
54 // - "front-end" (frontend.*, validate.*, create_trmul_params.*,
55 // prepare_packed_matrices.*) - the work to handle the entry-point call down to
56 // the point where it can be handed off to the middle/back ends below. That
57 // includes routines that select RunKernel and RunPack
58 // implementations statically based on those template parameters.
59 // - "back-end" (kernel_*.*, pack_*.*)- this consists of the implementations of
60 // RunKernel and RunPack, often in assembly code, which are the building blocks
61 // that Ruy calls to perform matrix multiplication.  These are templated so that
62 // only the requested types/Path's are actually emitted by the compiler.
63 // - "middle-end" (trmul.*) - this is the part of Ruy that orchestrates the
64 // calls to the "back-end" optimized building blocks. This layer has to deal
65 // with issues like cache locality and low-overhead multi-threading.
66 //
67 // There is a desire for the "middle-end" to be non-templated in order to
68 // simplify the implementation and reduce code-size. We type-erase when going
69 // from the "front-end" to the "middle-end", and un-type-erase going from the
70 // "middle-end" to the "back-end". The un-type-erasure is possible because the
71 // "front-end" is responsible for instantiating the needed "back-end" templates,
72 // and thus the static type information is still present.
73 //
74 // Each layer of Ruy uses matrix types:
75 // - "entry-point": Matrix<T>
76 // - "front-end": Mat
77 // - "middle-end": EMat, PEMat
78 // - "back-end": Mat, PMat
79 //
80 // The use of separate types for packed matrices is not essential, but makes it
81 // obvious at a glance whether a matrix is a packed matrix or not. We would
82 // reconsider this decision if there was significant duplication between packed
83 // and unpacked matrices, but that doesn't seem to be the case at the moment.
84 //
85 // Another goal is to keep the user-facing Matrix<T> as simple and
86 // understandable as possible. Ideally, a user should be able to read the struct
87 // definition for Matrix<T> and see a very simple definition with no internal
88 // details like sums and kernel block layout.
89 
90 #ifndef RUY_RUY_INTERNAL_MATRIX_H_
91 #define RUY_RUY_INTERNAL_MATRIX_H_
92 
93 #include <cstddef>
94 #include <cstdint>
95 #include <type_traits>
96 #include <utility>
97 
98 #include "ruy/check_macros.h"
99 #include "ruy/matrix.h"
100 #include "ruy/size_util.h"
101 
102 namespace ruy {
103 
104 // Internal counterpart of Layout, used by Mat.
105 struct MatLayout final {
106   std::int32_t rows = 0;
107   std::int32_t cols = 0;
108   // Stride is the offset between two adjacent matrix elements
109   // in the non-contiguous direction.
110   std::int32_t stride = 0;
111   Order order = Order::kColMajor;
112 };
113 
ToInternal(const Layout & src)114 inline MatLayout ToInternal(const Layout& src) {
115   MatLayout ret;
116   ret.rows = src.rows();
117   ret.cols = src.cols();
118   ret.stride = src.stride();
119   ret.order = src.order();
120   return ret;
121 }
122 
123 // Internal counterpart of Matrix
124 template <typename Scalar>
125 struct Mat final {
126   detail::ConstCheckingPtr<Scalar> data;
127   MatLayout layout;
128   Scalar zero_point = 0;
129   CachePolicy cache_policy = CachePolicy::kNeverCache;
130 };
131 
132 template <typename Scalar>
ToInternal(const Matrix<Scalar> & src)133 inline Mat<Scalar> ToInternal(const Matrix<Scalar>& src) {
134   Mat<Scalar> ret;
135   ret.data.set(src.data());
136   ret.layout = ToInternal(src.layout());
137   ret.zero_point = src.zero_point();
138   ret.cache_policy = src.cache_policy();
139   return ret;
140 }
141 
142 template <typename Scalar>
ToInternal(Matrix<Scalar> & src)143 inline Mat<Scalar> ToInternal(Matrix<Scalar>& src) {
144   Mat<Scalar> ret;
145   ret.data.set(src.data());
146   ret.layout = ToInternal(src.layout());
147   ret.zero_point = src.zero_point();
148   ret.cache_policy = src.cache_policy();
149   return ret;
150 }
151 
152 // KernelLayout describes small-scale block structure in a packed matrix layout.
153 // It's a runtime (as opposed to compile-time-constant) version of the
154 // FixedKernelLayout struct used to declare kernel layouts.
155 //
156 // This is is sometimes known as "tiling" in other contexts.
157 //
158 // For example, consider a packed matrix in column-major format with a
159 // column-major KernelLayout. The matrix logically has a shape of
160 // `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
161 // of shape `[cols / kcols, rows / krows, kcols, krows]`.
162 //
163 // Note that in the case of kcols=1, krows=1, this degenerates to
164 // `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
165 // structure.
166 struct KernelLayout final {
167   Order order = Order::kColMajor;
168   std::uint8_t rows = 1;
169   std::uint8_t cols = 1;
170 };
171 
172 // A packed matrix has a small-scale block structure that is not present in in
173 // the input matrices. This block structure is necessary for the kernels to
174 // process data efficiently.
175 //
176 // This struct is very similar to MatLayout, but has the extra KernelLayout
177 // field.
178 struct PMatLayout final {
179   std::int32_t rows = 0;
180   std::int32_t cols = 0;
181   // Stride is the offset between two adjacent matrix elements
182   // in the non-contiguous direction.
183   std::int32_t stride = 0;
184   Order order = Order::kColMajor;
185   // Small scale layout shuffling, potentially departing from
186   // linear row-major or column-major storage. See KernelLayout.
187   KernelLayout kernel;
188 };
189 
190 inline bool operator==(const PMatLayout& a, const PMatLayout& b) {
191   return a.cols == b.cols && a.rows == b.rows && a.stride == b.stride &&
192          a.order == b.order && a.kernel.rows == b.kernel.rows &&
193          a.kernel.cols == b.kernel.cols && a.kernel.order == b.kernel.order;
194 }
195 
196 // Dynamic representation for a type.
197 //
198 // The most important field in this struct is the size, which Ruy uses to know
199 // how much memory to allocate without having to be templated on a type.
200 // Signed-ness and floating-point-ness are mainly present as debugging checks.
201 //
202 // Note: Ruy does not use this struct to to dynamically dispatch between
203 // different typed implementations. As described in the comment at the top of
204 // this file, Ruy's "front-end", which is templated, instantiates all the
205 // necessary "back-end" routines with complete static knowledge of all the
206 // types.
207 struct Type final {
208   template <typename T>
Createfinal209   static Type Create() {
210     Type ret;
211     ret.is_signed = std::is_signed<T>::value;
212     ret.is_floating_point = std::is_floating_point<T>::value;
213     ret.size = sizeof(T);
214     return ret;
215   }
216 
217   template <typename T>
AssertIsfinal218   void AssertIs() const {
219     RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
220     RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
221     RUY_DCHECK_EQ(size, Create<T>().size);
222   }
223 
224   bool is_signed = false;
225   bool is_floating_point = false;
226   std::uint8_t size = 0;
227 };
228 
229 inline bool operator==(const Type& type1, const Type& type2) {
230   return type1.is_signed == type2.is_signed &&
231          type1.is_floating_point == type2.is_floating_point &&
232          type1.size == type2.size;
233 }
234 
235 // Type-erased matrix.
236 struct EMat final {
237   Type data_type;
238   void* data = nullptr;
239   MatLayout layout;
240   std::int32_t zero_point = 0;
241   CachePolicy cache_policy = CachePolicy::kNeverCache;
242 };
243 
244 // Type-erased packed matrix.
245 struct PEMat final {
246   Type data_type;
247   void* data = nullptr;
248   Type sums_type;
249   void* sums = nullptr;
250   PMatLayout layout;
251   std::int32_t zero_point = 0;
252 };
253 
254 // Convenient typed helper for packed matrices.
255 template <typename Scalar>
256 struct PMat final {
257   // The row/column sums needed for quantized matrix multiplication when
258   // the opposite operand of the multiplication uses a non-symmetric zero
259   // point.
260   // This member is only relevant for packed matrices.
261   // Additionally, Ruy always uses 32-bit signed accumulators for quantized
262   // matrix multiplication.
263   // For floating point types, there is no quantization, so this pointer
264   // will always be null. We still need code referencing it to compile
265   // though, even if it is always branched around. Hence we use Scalar*
266   // itself as the type in that case.
267   using SumsType =
268       typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
269                                 std::int32_t>::type;
270 
271   Scalar* data = nullptr;
272   SumsType* sums = nullptr;
273   PMatLayout layout;
274   std::int32_t zero_point = 0;
275 };
276 
277 template <typename T>
EraseType(const Mat<T> & matrix)278 EMat EraseType(const Mat<T>& matrix) {
279   EMat ret;
280   ret.data_type = Type::Create<T>();
281   ret.data = const_cast<void*>(static_cast<const void*>(matrix.data.get()));
282   ret.layout = matrix.layout;
283   ret.zero_point = matrix.zero_point;
284   ret.cache_policy = matrix.cache_policy;
285   return ret;
286 }
287 
288 template <typename T>
UneraseType(const EMat & matrix)289 Mat<T> UneraseType(const EMat& matrix) {
290   matrix.data_type.AssertIs<T>();
291   Mat<T> ret;
292   ret.data.set(static_cast<T*>(matrix.data));
293   ret.layout = matrix.layout;
294   ret.zero_point = matrix.zero_point;
295   ret.cache_policy = matrix.cache_policy;
296   return ret;
297 }
298 
299 template <typename T>
UneraseType(const PEMat & matrix)300 PMat<T> UneraseType(const PEMat& matrix) {
301   using SumsType = typename PMat<T>::SumsType;
302   matrix.data_type.AssertIs<T>();
303   matrix.sums_type.AssertIs<SumsType>();
304   PMat<T> ret;
305   ret.data = static_cast<T*>(matrix.data);
306   ret.sums = static_cast<SumsType*>(matrix.sums);
307   ret.layout = matrix.layout;
308   ret.zero_point = matrix.zero_point;
309   return ret;
310 }
311 
312 // Helpers for MatLayout / PMatLayout.
313 
IsUnstrided(const MatLayout & layout)314 inline bool IsUnstrided(const MatLayout& layout) {
315   if (layout.order == Order::kColMajor) {
316     return layout.stride == layout.rows;
317   } else {
318     return layout.stride == layout.cols;
319   }
320 }
321 
IsRowMajor(const MatLayout & layout)322 inline bool IsRowMajor(const MatLayout& layout) {
323   return layout.order == Order::kRowMajor;
324 }
325 
IsColMajor(const MatLayout & layout)326 inline bool IsColMajor(const MatLayout& layout) {
327   return layout.order == Order::kColMajor;
328 }
329 
FlatSize(const MatLayout & layout)330 inline int FlatSize(const MatLayout& layout) {
331   const int outerdim =
332       layout.order == Order::kColMajor ? layout.cols : layout.rows;
333   return layout.stride * outerdim;
334 }
335 
IsUnstrided(const PMatLayout & layout)336 inline bool IsUnstrided(const PMatLayout& layout) {
337   if (layout.order == Order::kColMajor) {
338     return layout.stride == layout.rows;
339   } else {
340     return layout.stride == layout.cols;
341   }
342 }
343 
IsRowMajor(const PMatLayout & layout)344 inline bool IsRowMajor(const PMatLayout& layout) {
345   return layout.order == Order::kRowMajor;
346 }
347 
IsColMajor(const PMatLayout & layout)348 inline bool IsColMajor(const PMatLayout& layout) {
349   return layout.order == Order::kColMajor;
350 }
351 
FlatSize(const PMatLayout & layout)352 inline int FlatSize(const PMatLayout& layout) {
353   const int outerdim =
354       layout.order == Order::kColMajor ? layout.cols : layout.rows;
355   return layout.stride * outerdim;
356 }
357 
358 // TODO(b/130417400) add a unit test
Offset(const MatLayout & layout,int row,int col)359 inline int Offset(const MatLayout& layout, int row, int col) {
360   // TODO(benoitjacob)  - should check this but this make the _slow tests take
361   // 5x longer.  Find a mitigation like in Eigen with an 'internal' variant
362   // bypassing the check?
363   // RUY_DCHECK_GE(row, 0);
364   // RUY_DCHECK_GE(col, 0);
365   // RUY_DCHECK_LT(row, layout.rows);
366   // RUY_DCHECK_LT(col, layout.cols);
367   int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
368   int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
369   return row * row_stride + col * col_stride;
370 }
371 
372 // TODO(b/130417400) add a unit test
Offset(const PMatLayout & layout,int row,int col)373 inline int Offset(const PMatLayout& layout, int row, int col) {
374   RUY_DCHECK(is_pot(layout.kernel.rows));
375   RUY_DCHECK(is_pot(layout.kernel.cols));
376   int row_outer = row & ~(layout.kernel.rows - 1);
377   int col_outer = col & ~(layout.kernel.cols - 1);
378   int row_stride_outer =
379       layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
380   int col_stride_outer =
381       layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
382   int offset_outer =
383       row_outer * row_stride_outer + col_outer * col_stride_outer;
384   int row_inner = row - row_outer;
385   int col_inner = col - col_outer;
386   int row_stride_inner =
387       layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
388   int col_stride_inner =
389       layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
390   int offset_inner =
391       row_inner * row_stride_inner + col_inner * col_stride_inner;
392   return offset_outer + offset_inner;
393 }
394 
395 // Helpers for Mat<T>.
396 
397 template <typename Scalar>
ElementPtr(const Mat<Scalar> & mat,int row,int col)398 const Scalar* ElementPtr(const Mat<Scalar>& mat, int row, int col) {
399   return mat.data.get() + Offset(mat.layout, row, col);
400 }
401 
402 template <typename Scalar>
ElementPtr(Mat<Scalar> * mat,int row,int col)403 Scalar* ElementPtr(Mat<Scalar>* mat, int row, int col) {
404   return mat->data.get() + Offset(mat->layout, row, col);
405 }
406 
407 template <typename Scalar>
Element(const Mat<Scalar> & mat,int row,int col)408 Scalar Element(const Mat<Scalar>& mat, int row, int col) {
409   return *ElementPtr(mat, row, col);
410 }
411 
412 // Helpers for PMat<T>.
413 // Duplicated from Matrix<T>, but the duplication seems acceptable.
414 
415 template <typename Scalar>
ElementPtr(const PMat<Scalar> & mat,int row,int col)416 const Scalar* ElementPtr(const PMat<Scalar>& mat, int row, int col) {
417   return mat.data + Offset(mat.layout, row, col);
418 }
419 
420 template <typename Scalar>
ElementPtr(PMat<Scalar> * mat,int row,int col)421 Scalar* ElementPtr(PMat<Scalar>* mat, int row, int col) {
422   return mat->data + Offset(mat->layout, row, col);
423 }
424 
425 template <typename Scalar>
Element(const PMat<Scalar> & mat,int row,int col)426 Scalar Element(const PMat<Scalar>& mat, int row, int col) {
427   return *ElementPtr(mat, row, col);
428 }
429 
430 // Helpers for PEMat.
431 
DataBytes(const PEMat & packed)432 inline int DataBytes(const PEMat& packed) {
433   return FlatSize(packed.layout) * packed.data_type.size;
434 }
435 
SumsBytes(const PEMat & packed)436 inline int SumsBytes(const PEMat& packed) {
437   // Packed matrices are only relevant for Ruy's TrMul implementations. For
438   // TrMul, the number of sums is always equal to the number of columns.
439   return packed.layout.cols * packed.sums_type.size;
440 }
441 
442 // Transpose helpers.
443 
Transpose(Order order)444 inline Order Transpose(Order order) {
445   return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
446 }
447 
Transpose(const MatLayout & layout)448 inline MatLayout Transpose(const MatLayout& layout) {
449   MatLayout result(layout);
450   result.order = Transpose(result.order);
451   std::swap(result.rows, result.cols);
452   return result;
453 }
454 
455 template <typename Scalar>
Transpose(const Mat<Scalar> & matrix)456 Mat<Scalar> Transpose(const Mat<Scalar>& matrix) {
457   Mat<Scalar> result(matrix);
458   result.layout = Transpose(result.layout);
459   return result;
460 }
461 
462 // Compile-time version of KernelLayout, used to declare kernel layouts in a
463 // way that can be consumed by compile-time logic.
464 template <Order tOrder, int tRows, int tCols>
465 struct FixedKernelLayout {
466   static constexpr Order kOrder = tOrder;
467   static constexpr int kRows = tRows;
468   static constexpr int kCols = tCols;
469 };
470 
471 template <typename FixedKernelLayout>
ToKernelLayout()472 KernelLayout ToKernelLayout() {
473   KernelLayout ret;
474   ret.order = FixedKernelLayout::kOrder;
475   ret.rows = FixedKernelLayout::kRows;
476   ret.cols = FixedKernelLayout::kCols;
477   return ret;
478 }
479 
480 #if (__cplusplus < 201703L)
481 // A static constexpr data member is automatically inline and should not require
482 // redeclaration without an initializer. This is actually deprecated from C++17
483 // onwards. Clang with -O0 without this can fail to link.
484 template <Order tOrder, int tRows, int tCols>
485 constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kCols;
486 template <Order tOrder, int tRows, int tCols>
487 constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kRows;
488 #endif
489 
490 }  // namespace ruy
491 
492 #endif  // RUY_RUY_INTERNAL_MATRIX_H_
493