/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  ihevce_ssd_and_sad_calculator_neon.c
 *
 * @brief
 *  Contains intrinsic definitions of functions for SSD and SAD computation
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - ihevce_ssd_and_sad_calculator_neon()
 *
 * @remarks
 *  None
 *
 ********************************************************************************
 */
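/*
 * For reference, the scalar behaviour implemented below is, in essence
 * (illustrative sketch only, not part of the build):
 *
 *     LWORD64 ssd = 0;
 *     UWORD32 sad = 0;
 *     for(r = 0; r < trans_size; r++)
 *         for(c = 0; c < trans_size; c++)
 *         {
 *             WORD32 diff = pu1_src[r * src_strd + c] - pu1_recon[r * recon_strd + c];
 *             sad += (diff < 0) ? -diff : diff;
 *             ssd += diff * diff;
 *         }
 *     *pu4_blk_sad = sad;
 *     return ssd;
 */
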
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevce_cmn_utils_instr_set_router.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
LWORD64 ihevce_ssd_and_sad_calculator_neon(
    UWORD8 *pu1_recon,
    WORD32 recon_strd,
    UWORD8 *pu1_src,
    WORD32 src_strd,
    WORD32 trans_size,
    UWORD32 *pu4_blk_sad)
{
    WORD32 i, ssd = 0;

    if(4 == trans_size)
    {
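        /* The whole 4x4 block fits in one q-register: load_unaligned_u8q()
         * gathers four rows of four bytes each from the strided buffers. */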
        const uint8x16_t src_u8 = load_unaligned_u8q(pu1_src, src_strd);
        const uint8x16_t ref_u8 = load_unaligned_u8q(pu1_recon, recon_strd);
        const uint8x8_t abs_l = vabd_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
        const uint8x8_t abs_h = vabd_u8(vget_high_u8(src_u8), vget_high_u8(ref_u8));
        const uint16x8_t sq_abs_l = vmull_u8(abs_l, abs_l);
        const uint16x8_t sq_abs_h = vmull_u8(abs_h, abs_h);
        uint16x8_t abs_sum;
        uint32x4_t b, d;
        uint32x2_t ssd, sad;
        uint64x2_t c;

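        /* SAD: widen the absolute differences to 16 bits and fold them down
         * with pairwise long additions. */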
        abs_sum = vaddl_u8(abs_l, abs_h);
        b = vpaddlq_u16(abs_sum);
        c = vpaddlq_u32(b);
        sad =
            vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), vreinterpret_u32_u64(vget_high_u64(c)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);
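        /* SSD: the squared differences are at most 255 * 255, so they fit in
         * u16 lanes; widen to u32 and reduce. */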
        b = vaddl_u16(vget_low_u16(sq_abs_l), vget_high_u16(sq_abs_l));
        d = vaddl_u16(vget_low_u16(sq_abs_h), vget_high_u16(sq_abs_h));
        b = vaddq_u32(b, d);
        ssd = vadd_u32(vget_low_u32(b), vget_high_u32(b));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(8 == trans_size)
    {
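        /* 8x8: one 8-byte row per iteration. Absolute differences accumulate
         * in a u16 vector and squared differences in a u32 vector. */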
        uint16x8_t abs_sum = vdupq_n_u16(0);
        uint32x4_t sqabs_sum = vdupq_n_u32(0);
        uint16x8_t abs, sqabs;
        uint32x4_t tmp_a;
        uint32x2_t sad, ssd;
        uint64x2_t tmp_b;

        for(i = 0; i < 8; i++)
        {
            const uint8x8_t src = vld1_u8(pu1_src);
            const uint8x8_t ref = vld1_u8(pu1_recon);

            abs = vabdl_u8(src, ref);
            sqabs = vmulq_u16(abs, abs);
            abs_sum = vaddq_u16(abs_sum, abs);
            tmp_a = vaddl_u16(vget_low_u16(sqabs), vget_high_u16(sqabs));
            sqabs_sum = vaddq_u32(sqabs_sum, tmp_a);

            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
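        /* Horizontal reduction of the SAD and SSD accumulators. */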
        tmp_a = vpaddlq_u16(abs_sum);
        tmp_b = vpaddlq_u32(tmp_a);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);
        ssd = vadd_u32(vget_low_u32(sqabs_sum), vget_high_u32(sqabs_sum));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(16 == trans_size)
    {
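        /* 16x16: one 16-byte load per row, split into low/high halves so the
         * widened absolute differences stay within u16 accumulators. */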
        uint16x8_t abs_sum_l = vdupq_n_u16(0);
        uint16x8_t abs_sum_h = vdupq_n_u16(0);
        uint32x4_t sqabs_sum_l = vdupq_n_u32(0);
        uint32x4_t sqabs_sum_h = vdupq_n_u32(0);
        uint16x8_t abs_l, abs_h;
        uint16x8_t sqabs_l, sqabs_h;
        uint32x4_t tmp_a, tmp_c;
        uint64x2_t tmp_b;
        uint32x2_t sad, ssd;
        WORD32 i;

        for(i = 0; i < 16; i++)
        {
            const uint8x16_t src = vld1q_u8(pu1_src);
            const uint8x16_t pred = vld1q_u8(pu1_recon);

            abs_l = vabdl_u8(vget_low_u8(src), vget_low_u8(pred));
            abs_h = vabdl_u8(vget_high_u8(src), vget_high_u8(pred));

            sqabs_l = vmulq_u16(abs_l, abs_l);
            sqabs_h = vmulq_u16(abs_h, abs_h);

            abs_sum_l = vaddq_u16(abs_sum_l, abs_l);
            abs_sum_h = vaddq_u16(abs_sum_h, abs_h);

            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));

            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);
            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
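        /* Fold the two halves together, then reduce to scalar SAD and SSD. */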
        tmp_a = vpaddlq_u16(abs_sum_l);
        tmp_a = vpadalq_u16(tmp_a, abs_sum_h);
        tmp_b = vpaddlq_u32(tmp_a);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);

        sqabs_sum_l = vaddq_u32(sqabs_sum_l, sqabs_sum_h);
        ssd = vadd_u32(vget_low_u32(sqabs_sum_l), vget_high_u32(sqabs_sum_l));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(32 == trans_size)
    {
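        /* 32x32: two 16-byte loads per row. Each u16 SAD lane gathers at most
         * 32 * 4 * 255 = 32640, so the accumulator cannot overflow. */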
        uint16x8_t abs_sum = vdupq_n_u16(0);
        uint16x8_t abs_sum_l, abs_sum_h;
        uint32x4_t sqabs_sum_l = vdupq_n_u32(0);
        uint32x4_t sqabs_sum_h = vdupq_n_u32(0);
        uint8x8_t abs_l, abs_h;
        uint16x8_t sqabs_l, sqabs_h;
        uint32x4_t tmp_a, tmp_c;
        uint64x2_t tmp_b;
        uint32x2_t sad, ssd;
        WORD32 i;

        for(i = 0; i < 32; i++)
        {
            const uint8x16_t src_0 = vld1q_u8(pu1_src);
            const uint8x16_t pred_0 = vld1q_u8(pu1_recon);
            const uint8x16_t src_1 = vld1q_u8(pu1_src + 16);
            const uint8x16_t pred_1 = vld1q_u8(pu1_recon + 16);

            abs_l = vabd_u8(vget_low_u8(src_0), vget_low_u8(pred_0));
            abs_h = vabd_u8(vget_high_u8(src_0), vget_high_u8(pred_0));

            abs_sum_l = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_1), vget_low_u8(pred_1));
            abs_h = vabd_u8(vget_high_u8(src_1), vget_high_u8(pred_1));

            abs_sum_h = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_sum_l = vaddq_u16(abs_sum_l, abs_sum_h);
            abs_sum = vaddq_u16(abs_sum, abs_sum_l);

            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
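        /* Reduce the row-wise accumulators to scalar SAD and SSD. */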
        tmp_a = vpaddlq_u16(abs_sum);
        tmp_b = vpaddlq_u32(tmp_a);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);

        sqabs_sum_l = vaddq_u32(sqabs_sum_l, sqabs_sum_h);
        ssd = vadd_u32(vget_low_u32(sqabs_sum_l), vget_high_u32(sqabs_sum_l));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(64 == trans_size)
    {
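        /* 64x64: four 16-byte loads per row. A u16 SAD accumulator could
         * overflow here (64 * 8 * 255 > 65535), so every row's sum is widened
         * to u32 before being accumulated. */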
        uint32x4_t abs_sum = vdupq_n_u32(0);
        uint16x8_t abs_sum_0, abs_sum_1, abs_sum_2, abs_sum_3;
        uint32x4_t sqabs_sum_l = vdupq_n_u32(0);
        uint32x4_t sqabs_sum_h = vdupq_n_u32(0);
        uint8x8_t abs_l, abs_h;
        uint16x8_t sqabs_l, sqabs_h;
        uint32x4_t tmp_a, tmp_c;
        uint64x2_t tmp_b;
        uint32x2_t sad, ssd;
        WORD32 i;

        for(i = 0; i < 64; i++)
        {
            const uint8x16_t src_0 = vld1q_u8(pu1_src);
            const uint8x16_t pred_0 = vld1q_u8(pu1_recon);
            const uint8x16_t src_1 = vld1q_u8(pu1_src + 16);
            const uint8x16_t pred_1 = vld1q_u8(pu1_recon + 16);
            const uint8x16_t src_2 = vld1q_u8(pu1_src + 32);
            const uint8x16_t pred_2 = vld1q_u8(pu1_recon + 32);
            const uint8x16_t src_3 = vld1q_u8(pu1_src + 48);
            const uint8x16_t pred_3 = vld1q_u8(pu1_recon + 48);

            abs_l = vabd_u8(vget_low_u8(src_0), vget_low_u8(pred_0));
            abs_h = vabd_u8(vget_high_u8(src_0), vget_high_u8(pred_0));

            abs_sum_0 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_1), vget_low_u8(pred_1));
            abs_h = vabd_u8(vget_high_u8(src_1), vget_high_u8(pred_1));

            abs_sum_1 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_2), vget_low_u8(pred_2));
            abs_h = vabd_u8(vget_high_u8(src_2), vget_high_u8(pred_2));

            abs_sum_2 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_3), vget_low_u8(pred_3));
            abs_h = vabd_u8(vget_high_u8(src_3), vget_high_u8(pred_3));

            abs_sum_3 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_sum_0 = vaddq_u16(abs_sum_0, abs_sum_1);
            abs_sum_2 = vaddq_u16(abs_sum_2, abs_sum_3);
            abs_sum_0 = vaddq_u16(abs_sum_0, abs_sum_2);
            tmp_a = vaddl_u16(vget_low_u16(abs_sum_0), vget_high_u16(abs_sum_0));
            abs_sum = vaddq_u32(abs_sum, tmp_a);

            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
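        /* Final horizontal reduction to scalar SAD and SSD. */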
        tmp_b = vpaddlq_u32(abs_sum);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);

        sqabs_sum_l = vaddq_u32(sqabs_sum_l, sqabs_sum_h);
        ssd = vadd_u32(vget_low_u32(sqabs_sum_l), vget_high_u32(sqabs_sum_l));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    return (ssd);
}