1 // Copyright 2015 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // unpack.h: unpacking the result blocks computed by compute.h,
16 // storing them into the destination matrix.
17
18 #ifndef GEMMLOWP_INTERNAL_UNPACK_H_
19 #define GEMMLOWP_INTERNAL_UNPACK_H_
20
21 #include "allocator.h"
22 #include "block_params.h"
23 #include "output.h"
24 #include "pack.h"
25
26 #include <cmath>
27
28 namespace gemmlowp {
29
30 class PackedResult {
31 public:
PackedResult(Allocator * _allocator,const BlockParams & _block_params)32 PackedResult(Allocator* _allocator, const BlockParams& _block_params)
33 : allocator_(_allocator), block_params_(_block_params) {
34 matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows *
35 block_params_.l2_cols);
36 }
37
~PackedResult()38 ~PackedResult() {}
39
Map()40 MatrixMap<std::int32_t, MapOrder::ColMajor> Map() {
41 return MatrixMap<std::int32_t, MapOrder::ColMajor>(
42 allocator_->GetPointer<std::int32_t>(matrix_handle_),
43 block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
44 }
45
Map()46 MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const {
47 return MatrixMap<const std::int32_t, MapOrder::ColMajor>(
48 allocator_->GetPointer<const std::int32_t>(matrix_handle_),
49 block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
50 }
51
52 private:
53 Allocator* allocator_;
54 Allocator::Handle matrix_handle_;
55 const BlockParams& block_params_;
56 };
57
// Returns x * numerator / denominator, computed with integer arithmetic only.
//
// The constant fraction is split at compile time into
//   numerator / denominator = int_quotient + remaining_numerator / denominator
// where int_quotient is the fraction rounded to the nearest integer, so the
// remaining fraction has magnitude at most 1/2. The remaining fraction is
// held as a fixed-point constant scaled by 2^31; the part of the product it
// contributes is rounded to nearest, with ties going away from zero.
//
// Template parameters:
//   numerator, denominator: the constant fraction; denominator must be
//     nonzero (enforced at compile time).
// Parameter:
//   x: the value to scale.
// Returns:
//   x * int_quotient + round(x * remaining_numerator / denominator).
//   When numerator == denominator, x is returned unchanged.
template <std::uint32_t numerator, std::uint32_t denominator>
std::int32_t RoundingMultiplyByConstantFraction(std::int32_t x) {
  static_assert(denominator != 0,
                "RoundingMultiplyByConstantFraction: denominator must be "
                "nonzero");

  if (numerator == denominator) {
    return x;
  }

  // We'll use only signed arithmetic here. This is
  // simpler (since this function operates on signed int32's) and
  // more friendly to ARM NEON, where this allows us to use the
  // VQRDMULH instruction.
  //
  // The constants are derived in 64-bit signed arithmetic: the template
  // arguments are unsigned 32-bit, so doing this in 32 bits would let
  // numerator + denominator / 2 wrap around for large arguments and would
  // rely on unsigned wraparound to obtain a negative remaining numerator.
  constexpr std::int64_t wide_numerator = numerator;
  constexpr std::int64_t wide_denominator = denominator;
  constexpr std::int64_t wide_quotient =
      (wide_numerator + wide_denominator / 2) / wide_denominator;

  constexpr std::int32_t int_quotient =
      static_cast<std::int32_t>(wide_quotient);
  // Lies in [-denominator / 2, denominator / 2], hence fits in 32 bits.
  constexpr std::int32_t remaining_numerator = static_cast<std::int32_t>(
      wide_numerator - wide_quotient * wide_denominator);
  // remaining_numerator / denominator as a fixed-point value scaled by 2^31;
  // its magnitude is at most 2^30.
  constexpr std::int32_t scaled_remaining_numerator =
      static_cast<std::int32_t>(
          (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
          wide_denominator);

  const std::int64_t scaled_remaining_product =
      static_cast<std::int64_t>(x) *
      static_cast<std::int64_t>(scaled_remaining_numerator);

  // Adding half of the 2^31 divisor, with the sign of the product, turns the
  // truncating division below into round-to-nearest.
  const std::int32_t scaled_remaining_product_nudge =
      scaled_remaining_product > 0 ? (1 << 30) : -(1 << 30);

  const std::int32_t remaining_product = static_cast<std::int32_t>(
      (scaled_remaining_product + scaled_remaining_product_nudge) /
      (static_cast<std::int64_t>(1) << 31));

  return x * int_quotient + remaining_product;
}
89
90 template <typename BitDepthParams, typename ResultBlockType,
91 typename PackedResultType, typename LhsOffset, typename RhsOffset,
92 typename OutputPipelineType>
93 struct UnpackResultImplGeneric {
UnpackUnpackResultImplGeneric94 static void Unpack(ResultBlockType* dst, const PackedResultType& src,
95 int depth, const std::int32_t* lhs_sums_of_each_slice,
96 const std::int32_t* rhs_sums_of_each_slice,
97 const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
98 const OutputPipelineType& output_pipeline) {
99 auto src_map = src.Map();
100 // No top-level blocking in the depth dimension at the moment.
101 // Too much loss of precision.
102 const int kLhsBits = BitDepthParams::LhsBitDepth::kBits;
103 const int kRhsBits = BitDepthParams::RhsBitDepth::kBits;
104 const std::int32_t kLhsMax = (1 << kLhsBits) - 1;
105 const std::int32_t kRhsMax = (1 << kRhsBits) - 1;
106 OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1>
107 output_pipeline_executor(output_pipeline);
108 for (int c = 0; c < dst->cols(); c++) {
109 for (int r = 0; r < dst->rows(); r++) {
110 // To understand this code, read
111 // doc/low-precision.txt
112 // doc/less-than-8-bit.txt
113 // We have 4 terms to sum: xx, x1, 1x, 11.
114 // In case of requantization, we first need to scale them back
115 // to the original scale, using RoundingMultiplyByConstantFraction.
116 std::int32_t raw_xx = src_map(r, c);
117 std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset(c);
118 std::int32_t raw_1x = rhs_sums_of_each_slice[c] * lhs_offset(r);
119 std::int32_t term_xx =
120 RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
121 raw_xx);
122 std::int32_t term_x1 =
123 RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
124 std::int32_t term_1x =
125 RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
126 std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth;
127 // Sum the 4 terms.
128 FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11;
129
130 output_pipeline_executor.Execute(sum, dst, r, c);
131 }
132 }
133 }
134 };
135
136 template <typename BitDepthParams, typename ResultBlockType,
137 typename PackedResultType, typename LhsOffset, typename RhsOffset,
138 typename OutputPipelineType>
139 struct UnpackResultImpl
140 : UnpackResultImplGeneric<BitDepthParams, ResultBlockType, PackedResultType,
141 LhsOffset, RhsOffset, OutputPipelineType> {};
142
143 template <typename BitDepthParams, typename ResultBlockType,
144 typename PackedResultType, typename LhsOffset, typename RhsOffset,
145 typename OutputPipelineType>
UnpackResult(ResultBlockType * dst,const PackedResultType & src,int depth,const std::int32_t * lhs_sums_of_each_slice,const std::int32_t * rhs_sums_of_each_slice,const LhsOffset & lhs_offset,const RhsOffset & rhs_offset,const OutputPipelineType & output_pipeline)146 void UnpackResult(ResultBlockType* dst, const PackedResultType& src, int depth,
147 const std::int32_t* lhs_sums_of_each_slice,
148 const std::int32_t* rhs_sums_of_each_slice,
149 const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
150 const OutputPipelineType& output_pipeline) {
151 ScopedProfilingLabel label("unpack");
152 UnpackResultImpl<BitDepthParams, ResultBlockType, PackedResultType,
153 LhsOffset, RhsOffset, OutputPipelineType>::Unpack(
154 dst, src, depth, lhs_sums_of_each_slice, rhs_sums_of_each_slice,
155 lhs_offset, rhs_offset, output_pipeline);
156 }
157
158 } // namespace gemmlowp
159
160 #ifdef GEMMLOWP_NEON
161 #include "unpack_neon.h"
162 #endif
163
164 #endif // GEMMLOWP_INTERNAL_UNPACK_H_
165