// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// unpack_neon.h: optimized NEON specializations of the templates in unpack.h.

#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_
#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_

#include "output_neon.h"
#include "unpack.h"

#include <arm_neon.h>

namespace gemmlowp {

template <std::uint32_t numerator, std::uint32_t denominator>
int32x4_t RoundingMultiplyByConstantFraction(int32x4_t x) {
  static_assert(numerator > 0 && denominator > 0,
                "only supporting positive num/denom");

  if (numerator == denominator) {
    return x;
  }

  static const std::int32_t int_quotient =
      (numerator + denominator / 2) / denominator;
  static const std::int32_t remaining_numerator =
      numerator - int_quotient * denominator;
  static const std::int32_t scaled_remaining_numerator =
      static_cast<std::int32_t>(
          (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
          denominator);
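  // For example, with 7-bit operands the unpack code below instantiates
  // RoundingMultiplyByConstantFraction<255, 127>: then int_quotient = 2,
  // remaining_numerator = 1, scaled_remaining_numerator = 2^31 / 127, and
  // the result approximates x * 2 + x / 127 = x * 255 / 127.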
  // Note: vqrdmulh is the rounding doubling multiply high instruction:
  // per lane it computes
  //   (2 * x * scaled_remaining_numerator + (1 << 31)) >> 32
  // with saturation, applying the Q31 fraction with rounding.
  const int32x4_t remaining_product =
      vqrdmulhq_n_s32(x, scaled_remaining_numerator);

  return vmlaq_n_s32(remaining_product, x, int_quotient);
}

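// Loads the next four int32 values from a VectorMap-backed iterator and
// advances the iterator past them.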
template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorMap<tScalar, tShape>>* iterator) {
  const int32x4_t result = vld1q_s32(iterator->get());
  *iterator += 4;
  return result;
}

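// For a VectorDup (a constant vector), broadcasts its single value to all
// four lanes.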
template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorDup<tScalar, tShape>>* iterator) {
  const int32x4_t result = vdupq_n_s32(**iterator);
  // Increment really does nothing for VectorDup.
  *iterator += 4;
  return result;
}

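// NEON specialization of UnpackResultImpl for column-major result maps.
// As in the generic code in unpack.h, each result entry is reconstructed
// as the sum of four terms, term_xx + term_x1 + term_1x + term_11; here
// the entries are handled 16 at a time, then 4 at a time, then one by one.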
template <typename BitDepthParams, typename PackedResultType,
          typename OutputScalar, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
struct UnpackResultImpl<BitDepthParams,
                        MatrixMap<OutputScalar, MapOrder::ColMajor>,
                        PackedResultType, LhsOffset, RhsOffset,
                        OutputPipelineType> {
  typedef MatrixMap<OutputScalar, MapOrder::ColMajor> ResultBlockType;
  static void Unpack(ResultBlockType* dst, const PackedResultType& src,
                     int depth, const std::int32_t* lhs_sums_of_each_slice,
                     const std::int32_t* rhs_sums_of_each_slice,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
    ScopedProfilingLabel label("optimized path (NEON)");
    const int kLhsBits = BitDepthParams::LhsBitDepth::kBits;
    const int kRhsBits = BitDepthParams::RhsBitDepth::kBits;
    const std::int32_t kLhsMax = (1 << kLhsBits) - 1;
    const std::int32_t kRhsMax = (1 << kRhsBits) - 1;
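    // kLhsMax/kRhsMax are the maximum values representable at the operand
    // bit depths. Terms computed at those depths are rescaled back to the
    // full 8-bit scale by the 255 / kLhsMax etc. fractions below (a no-op
    // when a depth is already 8-bit, since then kMax == 255).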
    auto src_map = src.Map();
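    // One output pipeline executor per fragment granularity (1, 4 and 16
    // entries), matching the three unpacking loops below.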
    OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1>
        output_pipeline_executor_int32x1x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x4x1>
        output_pipeline_executor_int32x4x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x16x1>
        output_pipeline_executor_int32x16x1(output_pipeline);

    for (int c = 0; c < dst->cols(); c++) {
      const std::int32_t* src_ptr = src_map.data(0, c);
      const std::int32_t* sums_of_each_slice_ptr = lhs_sums_of_each_slice;
      auto lhs_offset_iter = const_iterator(lhs_offset);
      const std::int32_t rhs_offset_c = rhs_offset(c);
      const std::int32_t rhs_sums_of_each_slice_c = rhs_sums_of_each_slice[c];

      // Handle 16 values at once for higher performance.
      int dst_rows_aligned16 = RoundDown<16>(dst->rows());
      for (int r = 0; r < dst_rows_aligned16; r += 16) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11.
        // Refer to the generic code in unpack.h.
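        // raw_xx: the raw int32 accumulators from the packed result,
        // before any rescaling.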
        int32x4_t raw_xx[4];
        for (int i = 0; i < 4; i++) {
          raw_xx[i] = vld1q_s32(src_ptr);
          src_ptr += 4;
        }
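        // raw_x1: the per-slice lhs sums times this column's rhs offset.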
        int32x4_t raw_x1[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
          raw_x1[i] = vmulq_n_s32(sum_x1, rhs_offset_c);
          sums_of_each_slice_ptr += 4;
        }
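        // raw_1x and term_11 both consume the lhs offsets for these 16
        // rows: raw_1x scales them by this column's rhs sum, term_11 by
        // rhs_offset * depth.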
        int32x4_t raw_1x[4];
        int32x4_t term_11[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t lhs_offsets = get_int32x4_t_and_inc(&lhs_offset_iter);
          raw_1x[i] = vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
          term_11[i] = vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        }
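        // Rescale each raw term from its bit-depth domain to the 8-bit
        // domain; RoundingMultiplyByConstantFraction is an identity when
        // numerator == denominator, i.e. at 8-bit depth.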
        int32x4_t term_xx[4];
        for (int i = 0; i < 4; i++) {
          term_xx[i] =
              RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                  raw_xx[i]);
        }
        int32x4_t term_x1[4];
        for (int i = 0; i < 4; i++) {
          term_x1[i] =
              RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1[i]);
        }
        int32x4_t term_1x[4];
        for (int i = 0; i < 4; i++) {
          term_1x[i] =
              RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x[i]);
        }
        int32x4x4_t q;
        for (int i = 0; i < 4; i++) {
          q.val[i] = vaddq_s32(vaddq_s32(term_xx[i], term_x1[i]),
                               vaddq_s32(term_1x[i], term_11[i]));
        }
        NEONFragmentInt32x16x1 f(q);
        output_pipeline_executor_int32x16x1.Execute(f, dst, r, c);
      }
      // We have finished handling groups of 16 entries at once; now
      // try to handle 4 entries at once.
      int dst_rows_aligned4 = RoundDown<4>(dst->rows());
      for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11.
        // Refer to the generic code in unpack.h.
        const int32x4_t raw_xx = vld1q_s32(src_ptr);
        src_ptr += 4;
        const int32x4_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
        const int32x4_t raw_x1 = vmulq_n_s32(sum_x1, rhs_offset_c);
        sums_of_each_slice_ptr += 4;
        const int32x4_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const int32x4_t lhs_offsets = get_int32x4_t_and_inc(&lhs_offset_iter);
        const int32x4_t raw_1x =
            vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
        const int32x4_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const int32x4_t term_11 =
            vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        int32x4_t q = vaddq_s32(vaddq_s32(term_xx, term_x1),
                                vaddq_s32(term_1x, term_11));
        NEONFragmentInt32x4x1 f(q);
        output_pipeline_executor_int32x4x1.Execute(f, dst, r, c);
      }
      // We have finished handling 4 entries at once; now handle
      // remaining entries one by one. This scalar code is similar
      // to the code in unpack.h, see comments there.
      for (int r = dst_rows_aligned4; r < dst->rows(); r++) {
        const std::int32_t raw_xx = src_map(r, c);
        const std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset_c;
        const std::int32_t raw_1x = rhs_sums_of_each_slice_c * lhs_offset(r);
        const std::int32_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const std::int32_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const std::int32_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth;
        FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11;
        output_pipeline_executor_int32x1x1.Execute(sum, dst, r, c);
      }
    }
  }
};

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_UNPACK_NEON_H_