1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // pack.h: packing blocks of the LHS and RHS into the data layout
16 // that is expected by compute.h and eventually by kernels.
17 // Because this data layout depends on the kernel format, code here
18 // is templated in KernelLhsFormat/KernelRhsFormat.
19 //
20 // Readers note: an important theme around here is that we try hard
21 // to handle both Lhs and Rhs with a single piece of code. We indifferently
22 // refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
23 // by (row, column) indices, we address them by (width, depth), as explained
24 // in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing,
25 // at once.
26 
27 #ifndef GEMMLOWP_INTERNAL_PACK_H_
28 #define GEMMLOWP_INTERNAL_PACK_H_
29 
30 #include <cstring>
31 
32 #include "allocator.h"
33 #include "block_params.h"
34 #include "common.h"
35 #include "kernel.h"
36 
37 namespace gemmlowp {
38 
39 // A PackedSideBlock instance is a packed block of either the LHS or RHS
40 // (whence the generic 'Side' name).
41 //
42 // 'Packed' means that it is laid out in the storage order that
43 // is expected by the specified kernel format. From a block of the input
44 // LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
45 // or PackRhs().
46 template <typename tKernelSideFormat>
47 class PackedSideBlock {
48  public:
49   typedef tKernelSideFormat KernelSideFormat;
50 
PackedSideBlock(Side side,Allocator * allocator,const BlockParams & block_params)51   PackedSideBlock(Side side, Allocator* allocator,
52                   const BlockParams& block_params)
53       : allocator_(allocator), pos_(0) {
54     GetSideBlockParams(side, &params_, block_params);
55     data_handle_ =
56         allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
57     sums_of_each_slice_handle_ =
58         allocator_->Reserve<std::int32_t>(params_.l2_width);
59   }
60 
~PackedSideBlock()61   ~PackedSideBlock() {}
62 
seek_run(int start_width,int start_depth)63   void seek_run(int start_width, int start_depth) const {
64     int kernel_run_depth =
65         std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
66     pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
67   }
68 
seek_next_cell()69   void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }
70 
seek_forward_n_cells(int n)71   void seek_forward_n_cells(int n) const {
72     pos_ += n * KernelSideFormat::Cell::kSize;
73   }
74 
75   // TODO(suharshs): The datatype can now be int8 as well. We could introduce a
76   // new int8 current_data impl as well. This change would propagate to all pack
77   // impls and the Kernel::Run API, which all assume uint8. For now we leave
78   // this as-is pending future refactor.
current_data()79   const std::uint8_t* current_data() const {
80     return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
81   }
82 
current_data()83   std::uint8_t* current_data() {
84     return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
85   }
86 
sums_of_each_slice()87   std::int32_t* sums_of_each_slice() {
88     return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
89   }
90 
sums_of_each_slice()91   const std::int32_t* sums_of_each_slice() const {
92     return allocator_->GetPointer<const std::int32_t>(
93         sums_of_each_slice_handle_);
94   }
95 
params()96   const SideBlockParams& params() const { return params_; }
97 
98  private:
99   // The block size parameters that this PackedSizeBlock follows.
100   // The L2 parameters determine its overall size, while the L1 parameters,
101   // together with the kernel format template parameter, determine
102   // the fine details of the storage/traversal order.
103   SideBlockParams params_;
104 
105   // Pointer to the allocator provided by the caller. Not owned.
106   // The Allocator is assumed to outlive the PackedSideBlock.
107   Allocator* const allocator_;
108 
109   // Handle on the buffer backing this packed block. Owned.
110   Allocator::Handle data_handle_;
111 
112   // Handle on the additional buffer backing the vector of sums of slices
113   // associated with this block. Owned.
114   Allocator::Handle sums_of_each_slice_handle_;
115 
116   // pos_ is the current position in the buffer, which we access
117   // sequentially, like a file.
118   // The idea is that we pack data in the same order as it is
119   // going to be traversed during the computation, which for
120   // cache-friendliness reasons is complicated to random-access,
121   // as the offsets calculations would be intricate. So we
122   // give up random-access addressing, and instead content ourselves
123   // with sequential access.
124   //
125   // pos_ is mutable because during the computation we will want to
126   // be able to iterate on the data in a const PackedSideBlock.
127   mutable int pos_;
128 };
129 
130 // WidthMajor and DepthMajor are custom phrases modelled after the
131 // standard terminology 'row-major' and 'column-major'. Their meaning
132 // should be transparent once one has read the explanation in kernel.h:
133 // for example, in the Lhs, the 'width' dimension is the rows dimension,
134 // so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
135 // Another way to put it: WidthMajor means that contiguous storage is used
136 // for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static constexpr SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  // Constructs a contiguous (packed) map: the stride equals the extent
  // of the minor dimension for the given storage order.
  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  // Copy operations are explicitly defaulted. Previously only the copy
  // constructor was user-declared, which made the implicitly generated
  // copy assignment operator deprecated (C++11 [class.copy]); copy
  // assignment is relied upon by PackingRegisterBlockBase, which assigns
  // to its complete_src_ member.
  SideMap(const SideMap& other) = default;
  SideMap& operator=(const SideMap& other) = default;

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  // Distance in elements between entries adjacent along the width dimension.
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  // Distance in elements between entries adjacent along the depth dimension.
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  // Pointer to the entry at (width, depth) coordinates (w, d).
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  // Returns a map addressing the sub-block of the given size starting at
  // (start_width, start_depth). The block must lie within this map.
  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
193 
194 // A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class is the generic, non-optimized implementation;
// it is the base class of the generic PackingRegisterBlock template,
// which may be overridden by template specialization. Overriding it is how
// one may provide optimized packing code paths.
199 //
200 // The packing of a block proceeds in two steps:
201 //   1. Ensuring that we have a complete block of source data, i.e. a block of
202 //      the compile-time prescribed size. This is where we handle unaligned
203 //      boundaries: if we don't have a complete block of source data, then
204 //      we copy and zero-extend it into a local temporary (complete_src_),
205 //      see MakeCompleteSrc. In the generic case, we do have a complete block,
206 //      so we just use it in-place, see UseCompleteSrcInPlace.
207 //   2. Packing a complete block into the destination, see Pack. This is the
208 //      most critical part, so it's convenient that unaligned boundaries have
209 //      already been handled in step 1.
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::InputScalar KernelInputScalar;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;
  static constexpr int kCellSize = CellFormat::kSize;
  static constexpr SideMapOrder kSrcOrder = SrcMapType::kOrder;
  // Value used both to zero-extend incomplete source blocks
  // (see MakeCompleteSrc) and as the offset subtracted from each source
  // value in Pack(). Its exact semantics come from the ZeroPointInputValue
  // trait in kernel.h — presumably chosen so that padding entries are
  // neutral after the subtraction below.
  static constexpr int kZeroPointInputValue =
      ZeroPointInputValue<KernelInputScalar, KernelScalar>::kValue;

  // complete_src_ starts out empty; it is (re)assigned before each Pack().
  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point to
  // in-place actual source data if it's already a complete block,
  // (see UseCompleteSrcInPlace)
  // or to the local buf_ below into which we copy incomplete blocks
  // (see MakeCompleteSrc)
  SrcMapType complete_src_;

  // Temporary buffer for loading incomplete blocks to,
  // in the source storage order. One kernel-width's worth of data,
  // kRegisterSize levels deep, one byte per entry.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects in-place source data that is already a complete block;
  // no copy is made.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into the local temporary
  // buf_, zero-extending it (with the zero-point value) to a complete
  // block, and points complete_src_ at buf_.
  void MakeCompleteSrc(const SrcMapType& src) {
    // Fill the whole buffer with the padding value first; the copies
    // below overwrite only the region actually covered by src.
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      // Copy one contiguous depth-run per width index.
      // NOTE(review): the memcpy byte counts assume 1-byte scalars — see
      // the int8 TODO on PackedSideBlock::current_data.
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      // Copy one contiguous width-run per depth index.
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }

    // Since the KernelInputScalar type may not be uint8, we need to cast buf_.
    complete_src_ = SrcMapType(reinterpret_cast<KernelInputScalar*>(buf_),
                               kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    // Traverse cells in packed storage order: for each band of kCellDepth
    // depth levels, lay the kCells cells out one after another.
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        // Per-width-slice sums for this cell's slice range.
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            // Subtract the zero-point offset in 16-bit arithmetic...
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
            // ...then deliberately truncate back to uint8 (wraparound) for
            // storage in the packed block, while the sum accumulates the
            // un-wrapped 16-bit value.
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    // We wrote kRegisterSize/kCellDepth bands of kCells cells each.
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};
294 
// Generic PackingRegisterBlock: just inherits the unoptimized base
// implementation. Architecture-specific headers included at the bottom of
// this file (pack_neon.h, pack_sse.h, pack_avx.h, pack_msa.h) may provide
// partial specializations of this template with optimized packing paths.
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};
298 
299 // Large-scale implementation of packing.
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  // Neither argument is owned; both must outlive this object.
  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block. Iterates over the whole source
  // block in L1-sized sub-blocks (depth-major), prefetching then packing
  // each one.
  void PackL2() {
    // The sums are accumulated into by Pack(), so they must start at zero.
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      // ds/ws: clamped sizes of the current L1 sub-block, smaller than
      // l1_depth/l1_width at the boundaries.
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun: splits an
  // L1 sub-block into kernel-width-wide runs.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      // The unary '+' makes a prvalue copy of kKernelWidth so that
      // std::min's const& parameters do not odr-use the static constexpr
      // member (which would require an out-of-line definition pre-C++17).
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1. The loop nests are
  // arranged so that the innermost stride steps one cache line at a time
  // along the source's contiguous dimension.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs only a run i.e. is the inner loop in the depth dimension.
  // Complete (kernel-width x kRegisterSize) blocks are packed in place;
  // partial blocks go through MakeCompleteSrc's zero-extension first.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      // Full kernel width: main loop over register-size-aligned depth...
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      // ...then at most one leftover partial-depth block.
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      // Partial kernel width: every block is incomplete, so every block
      // goes through the zero-extending copy.
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix block being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};
404 
405 // Packs a block of the input LHS matrix, into a PackedSideBlock.
406 template <typename PackedSideBlock, typename MatrixMapType>
PackLhs(PackedSideBlock * dst,const MatrixMapType & src)407 void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
408   ScopedProfilingLabel label("pack LHS");
409   static const SideMapOrder kSideMapOrder =
410       MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
411                                                   : SideMapOrder::DepthMajor;
412   typedef typename MatrixMapType::Scalar Scalar;
413   typedef SideMap<Scalar, kSideMapOrder> SideMapType;
414   SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
415   typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
416   ImplType impl(dst, src_side_map);
417   impl.PackL2();
418 }
419 
420 // Packs a block of the input RHS matrix, into a PackedSideBlock.
421 template <typename PackedSideBlock, typename MatrixMapType>
PackRhs(PackedSideBlock * dst,const MatrixMapType & src)422 void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
423   ScopedProfilingLabel label("pack RHS");
424   static const SideMapOrder kSideMapOrder =
425       MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
426                                                   : SideMapOrder::DepthMajor;
427   typedef typename MatrixMapType::Scalar Scalar;
428   typedef SideMap<Scalar, kSideMapOrder> SideMapType;
429   SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
430   typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
431   ImplType impl(dst, src_side_map);
432   impl.PackL2();
433 }
434 
435 }  // namespace gemmlowp
436 
437 #ifdef GEMMLOWP_NEON
438 #include "pack_neon.h"
439 #elif defined(GEMMLOWP_SSE4)
440 #include "pack_sse.h"
441 #elif defined(GEMMLOWP_AVX2)
442 #include "pack_avx.h"
443 #elif defined(GEMMLOWP_MSA)
444 #include "pack_msa.h"
445 #endif
446 
447 #endif  // GEMMLOWP_INTERNAL_PACK_H_
448