1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // compute.h: the central stage of the Gemm computation, operates
16 // on already-packed LHS and RHS blocks and calls the Gemm kernel
17 // to compute a block of the product.
18 
19 #ifndef GEMMLOWP_INTERNAL_COMPUTE_H_
20 #define GEMMLOWP_INTERNAL_COMPUTE_H_
21 
22 #include "block_params.h"
23 #include "kernel.h"
24 #include "pack.h"
25 
26 namespace gemmlowp {
27 
28 template <typename PackedLhs, typename PackedRhs, typename PackedResult>
29 class ComputeImpl {
30   typedef typename PackedLhs::KernelSideFormat KernelLhsFormat;
31   typedef typename PackedRhs::KernelSideFormat KernelRhsFormat;
32   typedef KernelFormat<KernelLhsFormat, KernelRhsFormat> Format;
33 
34   const KernelBase& kernel_;
35   const BlockParams& block_params_;
36 
37   PackedResult* const packed_result_;
38   const PackedLhs& packed_lhs_;
39   const PackedRhs& packed_rhs_;
40 
41  public:
ComputeImpl(const KernelBase & _kernel,const BlockParams & _block_params,PackedResult * _packed_result,const PackedLhs & _packed_lhs,const PackedRhs & _packed_rhs)42   ComputeImpl(const KernelBase& _kernel, const BlockParams& _block_params,
43               PackedResult* _packed_result, const PackedLhs& _packed_lhs,
44               const PackedRhs& _packed_rhs)
45       : kernel_(_kernel),
46         block_params_(_block_params),
47         packed_result_(_packed_result),
48         packed_lhs_(_packed_lhs),
49         packed_rhs_(_packed_rhs) {}
50 
Compute(int depth)51   void Compute(int depth) {
52     depth = RoundUp<Format::kDepth>(depth);
53     assert(depth <= block_params_.l2_depth);
54     for (int d = 0; d < depth; d += block_params_.l1_depth) {
55       int ds = std::min(block_params_.l1_depth, depth - d);
56 
57       for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) {
58         int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r);
59 
60         ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds);
61       }
62     }
63   }
64 
65  private:
MarkPackedResultBlockAsInitialized(const MatrixMap<std::int32_t,MapOrder::ColMajor> & packed_result_block)66   static void MarkPackedResultBlockAsInitialized(
67       const MatrixMap<std::int32_t, MapOrder::ColMajor>& packed_result_block) {
68 #ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
69     for (int col = 0; col < packed_result_block.cols(); col++) {
70       MarkMemoryAsInitialized(
71           packed_result_block.data() + col * packed_result_block.cols_stride(),
72           packed_result_block.rows());
73     }
74 #else
75     (void)packed_result_block;
76 #endif
77   }
78 
ComputeRun(int start_row,int start_col,int start_depth,int depth)79   void ComputeRun(int start_row, int start_col, int start_depth,
80                   int depth) GEMMLOWP_NOINLINE {
81     packed_lhs_.seek_run(start_row, start_depth);
82     packed_rhs_.seek_run(start_col, start_depth);
83     auto packed_result_block = packed_result_->Map().block(
84         start_row, start_col, Format::kRows, Format::kCols);
85     kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(),
86                 packed_result_block.cols_stride(), packed_lhs_.current_data(),
87                 packed_rhs_.current_data(), start_depth, depth);
88     MarkPackedResultBlockAsInitialized(packed_result_block);
89   }
90 
ComputeL1(int start_row,int rows,int start_col,int cols,int start_depth,int depth)91   void ComputeL1(int start_row, int rows, int start_col, int cols,
92                  int start_depth, int depth) {
93     assert(rows % Format::kRows == 0);
94     assert(cols % Format::kCols == 0);
95     assert(depth % Format::kDepth == 0);
96 
97     for (int c = 0; c < cols; c += Format::kCols) {
98       for (int r = 0; r < rows; r += Format::kRows) {
99         ComputeRun(start_row + r, start_col + c, start_depth, depth);
100       }
101     }
102   }
103 };
104 
105 template <typename PackedLhs, typename PackedRhs, typename PackedResult>
Compute(const KernelBase & kernel,const BlockParams & block_params,PackedResult * packed_result,const PackedLhs & packed_lhs,const PackedRhs & packed_rhs,int depth)106 void Compute(const KernelBase& kernel, const BlockParams& block_params,
107              PackedResult* packed_result, const PackedLhs& packed_lhs,
108              const PackedRhs& packed_rhs, int depth) {
109   ScopedProfilingLabel label("compute");
110   ComputeImpl<PackedLhs, PackedRhs, PackedResult> impl(
111       kernel, block_params, packed_result, packed_lhs, packed_rhs);
112 
113   impl.Compute(depth);
114 }
115 
116 }  // namespace gemmlowp
117 
118 #endif  // GEMMLOWP_INTERNAL_COMPUTE_H_
119