/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_ssd_and_sad_calculator_neon.c
*
* @brief
*  Contains intrinsic definitions of functions for ssd and sad computation
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ihevce_ssd_and_sad_calculator_neon()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevce_cmn_utils_instr_set_router.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
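/**
*******************************************************************************
*
* @brief
*  Computes the sum of squared differences (SSD) and the sum of absolute
*  differences (SAD) between a source block and its reconstruction
*
* @param[in] pu1_recon
*  Pointer to the reconstructed block
*
* @param[in] recon_strd
*  Stride of the reconstruction buffer
*
* @param[in] pu1_src
*  Pointer to the source block
*
* @param[in] src_strd
*  Stride of the source buffer
*
* @param[in] trans_size
*  Block size; 4, 8, 16, 32 and 64 are supported
*
* @param[out] pu4_blk_sad
*  Returns the SAD of the block
*
* @returns
*  The SSD of the block (0 if trans_size is unsupported)
*
*******************************************************************************
*/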
LWORD64 ihevce_ssd_and_sad_calculator_neon(
    UWORD8 *pu1_recon,
    WORD32 recon_strd,
    UWORD8 *pu1_src,
    WORD32 src_strd,
    WORD32 trans_size,
    UWORD32 *pu4_blk_sad)
{
    WORD32 i, ssd = 0;

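    /* Each supported block size gets its own NEON path: per row, the     */
    /* absolute differences are accumulated for the SAD, and their        */
    /* squares (computed into wider lanes) are accumulated for the SSD.   */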
    if(4 == trans_size)
    {
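        /* A 4x4 block is only 16 bytes, so all four rows of source and   */
        /* reconstruction each fit in a single q-register via strided     */
        /* unaligned loads.                                               */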
        const uint8x16_t src_u8 = load_unaligned_u8q(pu1_src, src_strd);
        const uint8x16_t ref_u8 = load_unaligned_u8q(pu1_recon, recon_strd);
        const uint8x8_t abs_l = vabd_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
        const uint8x8_t abs_h = vabd_u8(vget_high_u8(src_u8), vget_high_u8(ref_u8));
        const uint16x8_t sq_abs_l = vmull_u8(abs_l, abs_l);
        const uint16x8_t sq_abs_h = vmull_u8(abs_h, abs_h);
        uint16x8_t abs_sum;
        uint32x4_t b, d;
        uint32x2_t ssd, sad;
        uint64x2_t c;

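        /* SAD: widen the absolute differences to u16, then reduce across */
        /* lanes with pairwise additions down to a single u32.            */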
        abs_sum = vaddl_u8(abs_l, abs_h);
        b = vpaddlq_u16(abs_sum);
        c = vpaddlq_u32(b);
        sad =
            vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), vreinterpret_u32_u64(vget_high_u64(c)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);
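        /* SSD: widen the u16 squares to u32 and reduce to a scalar. */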
        b = vaddl_u16(vget_low_u16(sq_abs_l), vget_high_u16(sq_abs_l));
        d = vaddl_u16(vget_low_u16(sq_abs_h), vget_high_u16(sq_abs_h));
        b = vaddq_u32(b, d);
        ssd = vadd_u32(vget_low_u32(b), vget_high_u32(b));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(8 == trans_size)
    {
        uint16x8_t abs_sum = vdupq_n_u16(0);
        uint32x4_t sqabs_sum = vdupq_n_u32(0);
        uint16x8_t abs, sqabs;
        uint32x4_t tmp_a;
        uint32x2_t sad, ssd;
        uint64x2_t tmp_b;

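        /* Process one 8-pixel row per iteration: vabdl_u8 widens the     */
        /* absolute differences to u16, so they can be accumulated and    */
        /* squared (max 255 * 255 = 65025) without overflow.              */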
        for(i = 0; i < 8; i++)
        {
            const uint8x8_t src = vld1_u8(pu1_src);
            const uint8x8_t ref = vld1_u8(pu1_recon);

            abs = vabdl_u8(src, ref);
            sqabs = vmulq_u16(abs, abs);
            abs_sum = vaddq_u16(abs_sum, abs);
            tmp_a = vaddl_u16(vget_low_u16(sqabs), vget_high_u16(sqabs));
            sqabs_sum = vaddq_u32(sqabs_sum, tmp_a);

            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
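        /* Reduce the u16 SAD accumulator and the u32 SSD accumulator */
        /* across lanes to scalars.                                   */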
        tmp_a = vpaddlq_u16(abs_sum);
        tmp_b = vpaddlq_u32(tmp_a);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);
        ssd = vadd_u32(vget_low_u32(sqabs_sum), vget_high_u32(sqabs_sum));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(16 == trans_size)
    {
        uint16x8_t abs_sum_l = vdupq_n_u16(0);
        uint16x8_t abs_sum_h = vdupq_n_u16(0);
        uint32x4_t sqabs_sum_l = vdupq_n_u32(0);
        uint32x4_t sqabs_sum_h = vdupq_n_u32(0);
        uint16x8_t abs_l, abs_h;
        uint16x8_t sqabs_l, sqabs_h;
        uint32x4_t tmp_a, tmp_c;
        uint64x2_t tmp_b;
        uint32x2_t sad, ssd;
        WORD32 i;

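        /* Process one 16-pixel row per iteration as two 8-lane halves, */
        /* keeping separate u16 SAD and u32 SSD accumulators per half.  */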
        for(i = 0; i < 16; i++)
        {
            const uint8x16_t src = vld1q_u8(pu1_src);
            const uint8x16_t pred = vld1q_u8(pu1_recon);

            abs_l = vabdl_u8(vget_low_u8(src), vget_low_u8(pred));
            abs_h = vabdl_u8(vget_high_u8(src), vget_high_u8(pred));

            sqabs_l = vmulq_u16(abs_l, abs_l);
            sqabs_h = vmulq_u16(abs_h, abs_h);

            abs_sum_l = vaddq_u16(abs_sum_l, abs_l);
            abs_sum_h = vaddq_u16(abs_sum_h, abs_h);

            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));

            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);
            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
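        /* Merge the per-half accumulators and reduce across lanes. */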
        tmp_a = vpaddlq_u16(abs_sum_l);
        tmp_a = vpadalq_u16(tmp_a, abs_sum_h);
        tmp_b = vpaddlq_u32(tmp_a);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);

        sqabs_sum_l = vaddq_u32(sqabs_sum_l, sqabs_sum_h);
        ssd = vadd_u32(vget_low_u32(sqabs_sum_l), vget_high_u32(sqabs_sum_l));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(32 == trans_size)
    {
        uint16x8_t abs_sum = vdupq_n_u16(0);
        uint16x8_t abs_sum_l, abs_sum_h;
        uint32x4_t sqabs_sum_l = vdupq_n_u32(0);
        uint32x4_t sqabs_sum_h = vdupq_n_u32(0);
        uint8x8_t abs_l, abs_h;
        uint16x8_t sqabs_l, sqabs_h;
        uint32x4_t tmp_a, tmp_c;
        uint64x2_t tmp_b;
        uint32x2_t sad, ssd;
        WORD32 i;

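        /* Process one 32-pixel row per iteration as two 16-byte loads.   */
        /* A u16 SAD accumulator is still safe here: each lane gathers at */
        /* most 4 * 255 = 1020 per row, i.e. 32640 over 32 rows.          */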
        for(i = 0; i < 32; i++)
        {
            const uint8x16_t src_0 = vld1q_u8(pu1_src);
            const uint8x16_t pred_0 = vld1q_u8(pu1_recon);
            const uint8x16_t src_1 = vld1q_u8(pu1_src + 16);
            const uint8x16_t pred_1 = vld1q_u8(pu1_recon + 16);

            abs_l = vabd_u8(vget_low_u8(src_0), vget_low_u8(pred_0));
            abs_h = vabd_u8(vget_high_u8(src_0), vget_high_u8(pred_0));

            abs_sum_l = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_1), vget_low_u8(pred_1));
            abs_h = vabd_u8(vget_high_u8(src_1), vget_high_u8(pred_1));

            abs_sum_h = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_sum_l = vaddq_u16(abs_sum_l, abs_sum_h);
            abs_sum = vaddq_u16(abs_sum, abs_sum_l);

            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
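        /* Reduce the SAD and SSD accumulators across lanes to scalars. */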
        tmp_a = vpaddlq_u16(abs_sum);
        tmp_b = vpaddlq_u32(tmp_a);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);

        sqabs_sum_l = vaddq_u32(sqabs_sum_l, sqabs_sum_h);
        ssd = vadd_u32(vget_low_u32(sqabs_sum_l), vget_high_u32(sqabs_sum_l));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
    else if(64 == trans_size)
    {
        uint32x4_t abs_sum = vdupq_n_u32(0);
        uint16x8_t abs_sum_0, abs_sum_1, abs_sum_2, abs_sum_3;
        uint32x4_t sqabs_sum_l = vdupq_n_u32(0);
        uint32x4_t sqabs_sum_h = vdupq_n_u32(0);
        uint8x8_t abs_l, abs_h;
        uint16x8_t sqabs_l, sqabs_h;
        uint32x4_t tmp_a, tmp_c;
        uint64x2_t tmp_b;
        uint32x2_t sad, ssd;
        WORD32 i;

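        /* Process one 64-pixel row per iteration as four 16-byte loads.  */
        /* The SAD accumulator is widened to u32 here: a u16 lane could   */
        /* gather up to 8 * 255 = 2040 per row, which would overflow over */
        /* 64 rows.                                                       */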
        for(i = 0; i < 64; i++)
        {
            const uint8x16_t src_0 = vld1q_u8(pu1_src);
            const uint8x16_t pred_0 = vld1q_u8(pu1_recon);
            const uint8x16_t src_1 = vld1q_u8(pu1_src + 16);
            const uint8x16_t pred_1 = vld1q_u8(pu1_recon + 16);
            const uint8x16_t src_2 = vld1q_u8(pu1_src + 32);
            const uint8x16_t pred_2 = vld1q_u8(pu1_recon + 32);
            const uint8x16_t src_3 = vld1q_u8(pu1_src + 48);
            const uint8x16_t pred_3 = vld1q_u8(pu1_recon + 48);

            abs_l = vabd_u8(vget_low_u8(src_0), vget_low_u8(pred_0));
            abs_h = vabd_u8(vget_high_u8(src_0), vget_high_u8(pred_0));

            abs_sum_0 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_1), vget_low_u8(pred_1));
            abs_h = vabd_u8(vget_high_u8(src_1), vget_high_u8(pred_1));

            abs_sum_1 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_2), vget_low_u8(pred_2));
            abs_h = vabd_u8(vget_high_u8(src_2), vget_high_u8(pred_2));

            abs_sum_2 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_l = vabd_u8(vget_low_u8(src_3), vget_low_u8(pred_3));
            abs_h = vabd_u8(vget_high_u8(src_3), vget_high_u8(pred_3));

            abs_sum_3 = vaddl_u8(abs_l, abs_h);
            sqabs_l = vmull_u8(abs_l, abs_l);
            sqabs_h = vmull_u8(abs_h, abs_h);
            tmp_a = vaddl_u16(vget_low_u16(sqabs_l), vget_high_u16(sqabs_l));
            tmp_c = vaddl_u16(vget_low_u16(sqabs_h), vget_high_u16(sqabs_h));
            sqabs_sum_l = vaddq_u32(sqabs_sum_l, tmp_a);
            sqabs_sum_h = vaddq_u32(sqabs_sum_h, tmp_c);

            abs_sum_0 = vaddq_u16(abs_sum_0, abs_sum_1);
            abs_sum_2 = vaddq_u16(abs_sum_2, abs_sum_3);
            abs_sum_0 = vaddq_u16(abs_sum_0, abs_sum_2);
            tmp_a = vaddl_u16(vget_low_u16(abs_sum_0), vget_high_u16(abs_sum_0));
            abs_sum = vaddq_u32(abs_sum, tmp_a);

            pu1_src += src_strd;
            pu1_recon += recon_strd;
        }
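        /* Reduce the u32 SAD accumulator and the SSD accumulators across */
        /* lanes to scalars.                                              */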
        tmp_b = vpaddlq_u32(abs_sum);
        sad = vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(tmp_b)), vreinterpret_u32_u64(vget_high_u64(tmp_b)));
        *pu4_blk_sad = vget_lane_u32(sad, 0);

        sqabs_sum_l = vaddq_u32(sqabs_sum_l, sqabs_sum_h);
        ssd = vadd_u32(vget_low_u32(sqabs_sum_l), vget_high_u32(sqabs_sum_l));

        return vget_lane_u64(vpaddl_u32(ssd), 0);
    }
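    /* Unsupported trans_size: fall through and return the scalar ssd, */
    /* which is still 0.                                               */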
    return (ssd);
}