1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_sad_compute_neon.c
24 *
25 * @brief
26 *  Contains definitions of functions to compute sad
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes                                                             */
40 /*****************************************************************************/
41 /* System include files */
42 #include <string.h>
43 #include <assert.h>
44 #include <arm_neon.h>
45 
46 /* User include files */
47 #include "ihevc_typedefs.h"
48 #include "ihevc_macros.h"
49 #include "itt_video_api.h"
50 #include "ihevc_cmn_utils_neon.h"
51 #include "ihevce_ipe_instr_set_router.h"
52 
53 /*****************************************************************************/
54 /* Function Definitions                                                      */
55 /*****************************************************************************/
/**
 * @brief Sum of absolute differences between a 4x4 source block and a 4x4
 *        prediction block.
 *
 * @param pu1_src   Pointer to the source block.
 * @param pu1_pred  Pointer to the prediction block.
 * @param src_strd  Stride passed to the source block load.
 * @param pred_strd Stride passed to the prediction block load.
 *
 * @return The accumulated absolute difference, truncated to UWORD16.
 */
UWORD16 ihevce_4x4_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
{
    /* NOTE(review): load_unaligned_u8q is defined outside this file; it */
    /* presumably packs the four 4-byte rows into one 16-byte vector.    */
    const uint8x16_t src_rows = load_unaligned_u8q(pu1_src, src_strd);
    const uint8x16_t pred_rows = load_unaligned_u8q(pu1_pred, pred_strd);

    /* Widening |src - pred| of the low half, then accumulate the high half */
    const uint16x8_t diff_lo = vabdl_u8(vget_low_u8(src_rows), vget_low_u8(pred_rows));
    const uint16x8_t diff_all =
        vabal_u8(diff_lo, vget_high_u8(src_rows), vget_high_u8(pred_rows));

    /* Horizontal reduction: 8 x u16 -> 4 x u32 -> 2 x u64 */
    const uint64x2_t sum_64 = vpaddlq_u32(vpaddlq_u16(diff_all));

    /* Add lane 0 of the 32-bit view of each 64-bit half */
    const uint32x2_t part_lo = vreinterpret_u32_u64(vget_low_u64(sum_64));
    const uint32x2_t part_hi = vreinterpret_u32_u64(vget_high_u64(sum_64));

    return vget_lane_u32(vadd_u32(part_lo, part_hi), 0);
}
71 
/**
 * @brief Sum of absolute differences over a block that is 8 pixels wide and
 *        ht rows tall.
 *
 * @param pu1_src   Pointer to the first source row.
 * @param pu1_pred  Pointer to the first prediction row.
 * @param src_strd  Byte step between consecutive source rows.
 * @param pred_strd Byte step between consecutive prediction rows.
 * @param ht        Number of rows to process; asserted to be at most 8.
 *
 * @return The accumulated absolute difference, truncated to UWORD16.
 */
static UWORD16 ihevce_8xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc = vdupq_n_u16(0);
    uint64x2_t sum_64;
    WORD32 rows_left;

    /* With ht <= 8 the total is at most 8 * 8 * 255 = 16320, which fits */
    /* both the 16-bit lanes and the UWORD16 return type.                */
    assert(ht <= 8);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        /* One 8-byte row per iteration: acc += |src - pred| per column */
        acc = vabal_u8(acc, vld1_u8(pu1_src), vld1_u8(pu1_pred));
        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Horizontal reduction: 8 x u16 -> 4 x u32 -> 2 x u64 */
    sum_64 = vpaddlq_u32(vpaddlq_u16(acc));

    /* Add lane 0 of the 32-bit view of each 64-bit half */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_64)),
            vreinterpret_u32_u64(vget_high_u64(sum_64))),
        0);
}
98 
/**
 * @brief Sum of absolute differences over a block that is 16 pixels wide and
 *        ht rows tall.
 *
 * @param pu1_src   Pointer to the first source row.
 * @param pu1_pred  Pointer to the first prediction row.
 * @param src_strd  Byte step between consecutive source rows.
 * @param pred_strd Byte step between consecutive prediction rows.
 * @param ht        Number of rows to process; asserted to be at most 16.
 *
 * @return The accumulated absolute difference.
 */
static UWORD32 ihevce_16xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    /* Separate 16-bit accumulators for columns 0..7 and 8..15 */
    uint16x8_t acc_left = vdupq_n_u16(0);
    uint16x8_t acc_right = vdupq_n_u16(0);
    uint32x4_t sum_32;
    uint64x2_t sum_64;
    WORD32 rows_left;

    /* Each lane receives one difference (<= 255) per row, so 16 rows */
    /* stay well inside 16 bits.                                      */
    assert(ht <= 16);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t src_row = vld1q_u8(pu1_src);
        const uint8x16_t pred_row = vld1q_u8(pu1_pred);

        acc_left = vabal_u8(acc_left, vget_low_u8(src_row), vget_low_u8(pred_row));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_row), vget_high_u8(pred_row));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Widen-and-merge both accumulators into 4 x u32, then into 2 x u64 */
    sum_32 = vpadalq_u16(vpaddlq_u16(acc_left), acc_right);
    sum_64 = vpaddlq_u32(sum_32);

    /* Add lane 0 of the 32-bit view of each 64-bit half */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_64)),
            vreinterpret_u32_u64(vget_high_u64(sum_64))),
        0);
}
128 
/**
 * @brief Sum of absolute differences over a block that is 32 pixels wide and
 *        ht rows tall.
 *
 * @param pu1_src   Pointer to the first source row.
 * @param pu1_pred  Pointer to the first prediction row.
 * @param src_strd  Byte step between consecutive source rows.
 * @param pred_strd Byte step between consecutive prediction rows.
 * @param ht        Number of rows to process; asserted to be at most 32.
 *
 * @return The accumulated absolute difference.
 */
static UWORD32 ihevce_32xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    /* acc_left collects columns 0..15, acc_right collects columns 16..31 */
    uint16x8_t acc_left = vdupq_n_u16(0);
    uint16x8_t acc_right = vdupq_n_u16(0);
    uint32x4_t sum_32;
    uint64x2_t sum_64;
    WORD32 rows_left;

    /* Each lane receives two differences (<= 255 each) per row, so 32 */
    /* rows give at most 2 * 32 * 255 = 16320 per 16-bit lane.         */
    assert(ht <= 32);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        /* Left 16 columns */
        const uint8x16_t src_left = vld1q_u8(pu1_src);
        const uint8x16_t pred_left = vld1q_u8(pu1_pred);
        /* Right 16 columns */
        const uint8x16_t src_right = vld1q_u8(pu1_src + 16);
        const uint8x16_t pred_right = vld1q_u8(pu1_pred + 16);

        acc_left = vabal_u8(acc_left, vget_low_u8(src_left), vget_low_u8(pred_left));
        acc_left = vabal_u8(acc_left, vget_high_u8(src_left), vget_high_u8(pred_left));

        acc_right = vabal_u8(acc_right, vget_low_u8(src_right), vget_low_u8(pred_right));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_right), vget_high_u8(pred_right));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Widen-and-merge both accumulators into 4 x u32, then into 2 x u64 */
    sum_32 = vpadalq_u16(vpaddlq_u16(acc_left), acc_right);
    sum_64 = vpaddlq_u32(sum_32);

    /* Add lane 0 of the 32-bit view of each 64-bit half */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_64)),
            vreinterpret_u32_u64(vget_high_u64(sum_64))),
        0);
}
162 
/**
 * @brief Sum of absolute differences over a block that is 64 pixels wide and
 *        ht rows tall.
 *
 * @param pu1_src   Pointer to the first source row.
 * @param pu1_pred  Pointer to the first prediction row.
 * @param src_strd  Byte step between consecutive source rows.
 * @param pred_strd Byte step between consecutive prediction rows.
 * @param ht        Number of rows to process; asserted to be at most 64.
 *
 * @return The accumulated absolute difference.
 */
static UWORD32 ihevce_64xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    /* acc_left collects columns 0..31, acc_right collects columns 32..63 */
    uint16x8_t acc_left = vdupq_n_u16(0);
    uint16x8_t acc_right = vdupq_n_u16(0);
    uint32x4_t sum_32;
    uint64x2_t sum_64;
    WORD32 rows_left;

    /* Each lane receives four differences (<= 255 each) per row, so 64 */
    /* rows give at most 4 * 64 * 255 = 65280, just inside 16 bits.     */
    assert(ht <= 64);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        /* Columns 0..15 and 16..31 feed acc_left */
        const uint8x16_t src_a = vld1q_u8(pu1_src);
        const uint8x16_t pred_a = vld1q_u8(pu1_pred);
        const uint8x16_t src_b = vld1q_u8(pu1_src + 16);
        const uint8x16_t pred_b = vld1q_u8(pu1_pred + 16);
        /* Columns 32..47 and 48..63 feed acc_right */
        const uint8x16_t src_c = vld1q_u8(pu1_src + 32);
        const uint8x16_t pred_c = vld1q_u8(pu1_pred + 32);
        const uint8x16_t src_d = vld1q_u8(pu1_src + 48);
        const uint8x16_t pred_d = vld1q_u8(pu1_pred + 48);

        acc_left = vabal_u8(acc_left, vget_low_u8(src_a), vget_low_u8(pred_a));
        acc_left = vabal_u8(acc_left, vget_high_u8(src_a), vget_high_u8(pred_a));
        acc_left = vabal_u8(acc_left, vget_low_u8(src_b), vget_low_u8(pred_b));
        acc_left = vabal_u8(acc_left, vget_high_u8(src_b), vget_high_u8(pred_b));

        acc_right = vabal_u8(acc_right, vget_low_u8(src_c), vget_low_u8(pred_c));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_c), vget_high_u8(pred_c));
        acc_right = vabal_u8(acc_right, vget_low_u8(src_d), vget_low_u8(pred_d));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_d), vget_high_u8(pred_d));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Widen-and-merge both accumulators into 4 x u32, then into 2 x u64 */
    sum_32 = vpadalq_u16(vpaddlq_u16(acc_left), acc_right);
    sum_64 = vpaddlq_u32(sum_32);

    /* Add lane 0 of the 32-bit view of each 64-bit half */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_64)),
            vreinterpret_u32_u64(vget_high_u64(sum_64))),
        0);
}
204 
/**
 * @brief Sum of absolute differences over a blk_wd x blk_ht block whose
 *        dimensions are multiples of 4.
 *
 * Power-of-two widths up to 64 are handled by the matching fixed-width
 * kernel, taking up to blk_wd rows per call. Every other width is walked in
 * bands of 4 rows, each band being covered left to right with the widest of
 * the 32/16/8/4 column kernels that still fits.
 *
 * @param pu1_src   Pointer to the top-left source pixel.
 * @param pu1_pred  Pointer to the top-left prediction pixel.
 * @param src_strd  Byte step between consecutive source rows.
 * @param pred_strd Byte step between consecutive prediction rows.
 * @param blk_wd    Block width; asserted to be a multiple of 4.
 * @param blk_ht    Block height; asserted to be a multiple of 4.
 *
 * @return The accumulated SAD. If a width that passes the power-of-two test
 *         is not one of 4, 8, 16, 32 or 64 while rows remain (for example 0,
 *         or 1 or 2 when asserts are compiled out), -1 converted to UWORD32
 *         is returned.
 */
UWORD32 ihevce_4mx4n_sad_computer_neon(
    UWORD8 *pu1_src,
    UWORD8 *pu1_pred,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 blk_wd,
    WORD32 blk_ht)
{
    WORD32 sad = 0;
    WORD32 rows_left;

    assert(blk_wd % 4 == 0);
    assert(blk_ht % 4 == 0);

    if(((blk_wd & (blk_wd - 1)) != 0) || (blk_wd > 64))
    {
        /* Generic case: width is not a power of two, or is wider than 64 */
        for(rows_left = blk_ht; rows_left > 0; rows_left -= 4)
        {
            WORD32 col = 0;

            while(col < blk_wd)
            {
                const WORD32 cols_left = blk_wd - col;
                UWORD8 *pu1_src_blk = pu1_src + col;
                UWORD8 *pu1_pred_blk = pu1_pred + col;
                WORD32 step;

                /* Pick the widest kernel that fits in the remaining columns */
                if(cols_left >= 32)
                {
                    sad += ihevce_32xn_sad_computer_neon(
                        pu1_src_blk, pu1_pred_blk, src_strd, pred_strd, 4);
                    step = 32;
                }
                else if(cols_left >= 16)
                {
                    sad += ihevce_16xn_sad_computer_neon(
                        pu1_src_blk, pu1_pred_blk, src_strd, pred_strd, 4);
                    step = 16;
                }
                else if(cols_left >= 8)
                {
                    sad += ihevce_8xn_sad_computer_neon(
                        pu1_src_blk, pu1_pred_blk, src_strd, pred_strd, 4);
                    step = 8;
                }
                else
                {
                    sad += ihevce_4x4_sad_computer_neon(
                        pu1_src_blk, pu1_pred_blk, src_strd, pred_strd);
                    step = 4;
                }
                col += step;
            }

            /* Move down to the next band of 4 rows */
            pu1_src += (4 * src_strd);
            pu1_pred += (4 * pred_strd);
        }
        return sad;
    }

    /* blk_wd is expected to be one of { 4, 8, 16, 32, 64 } from here on */
    rows_left = blk_ht;
    while(rows_left > 0)
    {
        /* Process at most blk_wd rows per kernel call */
        const WORD32 ht = MIN(blk_wd, rows_left);

        if(blk_wd == 4)
        {
            sad += ihevce_4x4_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd);
        }
        else if(blk_wd == 8)
        {
            sad += ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
        }
        else if(blk_wd == 16)
        {
            sad += ihevce_16xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
        }
        else if(blk_wd == 32)
        {
            sad += ihevce_32xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
        }
        else if(blk_wd == 64)
        {
            sad += ihevce_64xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
        }
        else
        {
            /* should not be here */
            return -1;
        }

        rows_left -= ht;
        pu1_src += (ht * src_strd);
        pu1_pred += (ht * pred_strd);
    }
    return sad;
}
292 
/**
 * @brief Sum of absolute differences between an 8x8 source block and an 8x8
 *        prediction block.
 *
 * @param pu1_src   Pointer to the first source row.
 * @param pu1_pred  Pointer to the first prediction row.
 * @param src_strd  Byte step between consecutive source rows.
 * @param pred_strd Byte step between consecutive prediction rows.
 *
 * @return The value produced by the 8-wide kernel for 8 rows.
 */
UWORD16 ihevce_8x8_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
{
    /* Delegate to the 8-wide kernel with the row count fixed at 8 */
    const UWORD16 sad_8x8 =
        ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 8);

    return sad_8x8;
}
298 
/**
 * @brief Sum of absolute differences over a square trans_size x trans_size
 *        block, dispatched to the kernel of matching width.
 *
 * Note the argument order: each stride follows its pointer here, unlike the
 * kernels this function calls.
 *
 * @param pu1_src    Pointer to the first source row.
 * @param src_strd   Byte step between consecutive source rows.
 * @param pu1_pred   Pointer to the first prediction row.
 * @param pred_strd  Byte step between consecutive prediction rows.
 * @param trans_size Block edge length; 4, 8, 16, 32 and 64 are handled.
 *
 * @return The SAD of the block, or -1 for any other trans_size.
 */
WORD32 ihevce_nxn_sad_computer_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, WORD32 trans_size)
{
    /* Stays at -1 when trans_size matches none of the supported sizes */
    WORD32 sad = -1;

    if(trans_size == 4)
    {
        sad = ihevce_4x4_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd);
    }
    else if(trans_size == 8)
    {
        sad = ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 8);
    }
    else if(trans_size == 16)
    {
        sad = ihevce_16xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 16);
    }
    else if(trans_size == 32)
    {
        sad = ihevce_32xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 32);
    }
    else if(trans_size == 64)
    {
        sad = ihevce_64xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 64);
    }

    return sad;
}
319