/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_had_compute_neon.c
*
* @brief
*  Contains NEON intrinsic definitions of functions for computing the
*  Hadamard transform and SATD
*
* @author
*  Ittiam
*
* @par List of Functions:
*
* @remarks
*  None
*
********************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevce_had_satd.h"
#include "ihevce_cmn_utils_instr_set_router.h"

/*****************************************************************************/
/* Globals                                                                   */
/*****************************************************************************/
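/* Mask that clears lane 0 of a coefficient vector; after the forward
 * transform lane 0 of the first row holds the DC term, so ANDing with this
 * mask implements the ac_only SATD paths below. */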
const int16_t gu2_dc_mask[8] = { 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };

/*****************************************************************************/
/* Function Macros                                                           */
/*****************************************************************************/
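/* Load one row of 8 source and 8 prediction samples, widen to 16 bit and
 * store the residue in *r<k>. For chroma, vld2_u8 de-interleaves the two
 * chroma components and only plane 0 (.val[0]) is used. Advances the src and
 * pred pointers by one stride. */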
#define RESIDUE(k, is_chroma)                                                                      \
    if(!is_chroma)                                                                                 \
    {                                                                                              \
        const uint8x8_t s##k = vld1_u8(pu1_src);                                                   \
        const uint8x8_t p##k = vld1_u8(pu1_pred);                                                  \
        *r##k = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k));                                       \
        pu1_src += src_strd;                                                                       \
        pu1_pred += pred_strd;                                                                     \
    }                                                                                              \
    else                                                                                           \
    {                                                                                              \
        const uint8x8_t s##k = vld2_u8(pu1_src).val[0];                                            \
        const uint8x8_t p##k = vld2_u8(pu1_pred).val[0];                                           \
        *r##k = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k));                                       \
        pu1_src += src_strd;                                                                       \
        pu1_pred += pred_strd;                                                                     \
    }

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

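/* One 1-D pass of the 4x4 Hadamard transform (4-point butterflies across the
 * four row registers), applied to two 4x4 blocks in parallel; each register
 * holds one row of two horizontally adjacent blocks. */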
static INLINE void
    hadamard4x4_2_one_pass(int16x8_t *r0, int16x8_t *r1, int16x8_t *r2, int16x8_t *r3)
{
    const int16x8_t a0 = vaddq_s16(*r0, *r2);
    const int16x8_t a1 = vaddq_s16(*r1, *r3);
    const int16x8_t a2 = vsubq_s16(*r0, *r2);
    const int16x8_t a3 = vsubq_s16(*r1, *r3);

    *r0 = vaddq_s16(a0, a1);
    *r1 = vsubq_s16(a0, a1);
    *r2 = vaddq_s16(a2, a3);
    *r3 = vsubq_s16(a2, a3);
}

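/* 2-D 4x4 Hadamard transform of two horizontally adjacent 4x4 blocks of
 * residue: vertical pass, 4x4 transpose of both halves, horizontal pass.
 * On return r0..r3 hold the coefficients of both blocks. */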
static INLINE void hadamard4x4_2(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3)
{
    // compute error between src and pred
    RESIDUE(0, 0);
    RESIDUE(1, 0);
    RESIDUE(2, 0);
    RESIDUE(3, 0);

    // vertical hadamard tx
    hadamard4x4_2_one_pass(r0, r1, r2, r3);

    // transpose
    transpose_s16_4x4q(r0, r1, r2, r3);

    // horizontal hadamard tx
    hadamard4x4_2_one_pass(r0, r1, r2, r3);
}

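/* 2-D 4x4 Hadamard transforms of the four 4x4 sub-blocks of an 8x8 block:
 * r0..r3 hold the two top blocks, r4..r7 the two bottom blocks. */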
static INLINE void hadamard4x4_4(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3,
    int16x8_t *r4,
    int16x8_t *r5,
    int16x8_t *r6,
    int16x8_t *r7)
{
    // two 4x4 hadamard transforms of the top 8x4 (rows 0..3)
    hadamard4x4_2(pu1_src, src_strd, pu1_pred, pred_strd, r0, r1, r2, r3);

    // two 4x4 hadamard transforms of the bottom 8x4 (rows 4..7)
    pu1_src += (4 * src_strd);
    pu1_pred += (4 * pred_strd);
    hadamard4x4_2(pu1_src, src_strd, pu1_pred, pred_strd, r4, r5, r6, r7);
}

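/* SATDs of the four 4x4 transforms produced by hadamard4x4_4. The rounded
 * normalization (x + 2) >> 2 is done by vrshrn_n_s64(.., 2). The SATDs of the
 * two top blocks are stored at pi4_hsad[0..1], the two bottom ones a stride
 * further; returns the sum of all four. */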
static INLINE WORD32 hadamard_sad4x4_4(int16x8_t *a, WORD32 *pi4_hsad, WORD32 hsad_stride)
{
    int16x8_t p[8];
    int32x4_t b01, b23;
    int64x2_t c01, c23;
    int32x2_t d01, d23;

    // satd
    p[0] = vabsq_s16(a[0]);
    p[1] = vabsq_s16(a[1]);
    p[0] = vaddq_s16(p[0], p[1]);
    p[2] = vabsq_s16(a[2]);
    p[3] = vabsq_s16(a[3]);
    p[2] = vaddq_s16(p[2], p[3]);

    p[4] = vabsq_s16(a[4]);
    p[5] = vabsq_s16(a[5]);
    p[4] = vaddq_s16(p[4], p[5]);
    p[6] = vabsq_s16(a[6]);
    p[7] = vabsq_s16(a[7]);
    p[6] = vaddq_s16(p[6], p[7]);

    p[0] = vaddq_s16(p[0], p[2]);
    b01 = vpaddlq_s16(p[0]);
    c01 = vpaddlq_s32(b01);
    d01 = vrshrn_n_s64(c01, 2);
    vst1_s32(pi4_hsad, d01);
    pi4_hsad += hsad_stride;

    p[4] = vaddq_s16(p[4], p[6]);
    b23 = vpaddlq_s16(p[4]);
    c23 = vpaddlq_s32(b23);
    d23 = vrshrn_n_s64(c23, 2);
    vst1_s32(pi4_hsad, d23);

    d01 = vadd_s32(d01, d23);

    return (WORD32)(vget_lane_s64(vpaddl_s32(d01), 0));
}

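/* Builds the 8x8 Hadamard coefficients in place from the four 4x4 results
 * (one more butterfly stage in each direction), flags early CBF when any
 * |coeff| exceeds i4_frm_qstep / 256, and returns the SATD normalized as
 * (sum + 4) >> 3. */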
static INLINE WORD32 hadamard_sad8x8_using4x4(int16x8_t *a, WORD32 *early_cbf, WORD32 i4_frm_qstep)
{
    int16x8_t p[8];
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    int32x4_t b;
    int64x2_t c;
    int64_t satd;
    WORD32 i;

    for(i = 0; i < 4; i++)
    {
        int16x8_t p0 = vaddq_s16(a[i], a[i + 4]);
        int16x8_t p1 = vsubq_s16(a[i], a[i + 4]);

        int16x4_t q0 = vadd_s16(vget_low_s16(p0), vget_high_s16(p0));
        int16x4_t q1 = vsub_s16(vget_low_s16(p0), vget_high_s16(p0));
        int16x4_t q2 = vadd_s16(vget_low_s16(p1), vget_high_s16(p1));
        int16x4_t q3 = vsub_s16(vget_low_s16(p1), vget_high_s16(p1));

        a[i] = vcombine_s16(q0, q2);
        a[i + 4] = vcombine_s16(q1, q3);
    }

#define EARLY_EXIT(k)                                                                              \
    {                                                                                              \
        p[k] = vabsq_s16(a[k]);                                                                    \
        if(*early_cbf == 0)                                                                        \
        {                                                                                          \
            uint16x8_t cmp;                                                                        \
            cmp = vcgtq_s16(p[k], threshold);                                                      \
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp)), 0) ||                        \
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp)), 0))                         \
            {                                                                                      \
                *early_cbf = 1;                                                                    \
            }                                                                                      \
        }                                                                                          \
    }
    // satd
    EARLY_EXIT(0);
    EARLY_EXIT(1);
    p[0] = vaddq_s16(p[0], p[1]);
    EARLY_EXIT(2);
    EARLY_EXIT(3);
    p[2] = vaddq_s16(p[2], p[3]);

    EARLY_EXIT(4);
    EARLY_EXIT(5);
    p[4] = vaddq_s16(p[4], p[5]);
    EARLY_EXIT(6);
    EARLY_EXIT(7);
#undef EARLY_EXIT
    p[6] = vaddq_s16(p[6], p[7]);

    p[0] = vaddq_s16(p[0], p[2]);
    p[4] = vaddq_s16(p[4], p[6]);
    p[0] = vaddq_s16(p[0], p[4]);
    b = vpaddlq_s16(p[0]);
    c = vpaddlq_s32(b);
    satd = vget_lane_s64(vadd_s64(vget_low_s64(c), vget_high_s64(c)), 0);

    return ((satd + 4) >> 3);
}

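/* One 1-D pass of the 8x8 Hadamard transform: 8-point butterflies (three
 * stages) across the eight row registers. */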
static INLINE void hadamard8x8_one_pass(
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3,
    int16x8_t *r4,
    int16x8_t *r5,
    int16x8_t *r6,
    int16x8_t *r7)
{
    const int16x8_t a0 = vaddq_s16(*r0, *r4);
    const int16x8_t a4 = vsubq_s16(*r0, *r4);
    const int16x8_t a1 = vaddq_s16(*r1, *r5);
    const int16x8_t a5 = vsubq_s16(*r1, *r5);
    const int16x8_t a2 = vaddq_s16(*r2, *r6);
    const int16x8_t a6 = vsubq_s16(*r2, *r6);
    const int16x8_t a3 = vaddq_s16(*r3, *r7);
    const int16x8_t a7 = vsubq_s16(*r3, *r7);

    const int16x8_t b0 = vaddq_s16(a0, a2);
    const int16x8_t b2 = vsubq_s16(a0, a2);
    const int16x8_t b1 = vaddq_s16(a1, a3);
    const int16x8_t b3 = vsubq_s16(a1, a3);
    const int16x8_t b4 = vaddq_s16(a4, a6);
    const int16x8_t b6 = vsubq_s16(a4, a6);
    const int16x8_t b5 = vaddq_s16(a5, a7);
    const int16x8_t b7 = vsubq_s16(a5, a7);

    *r0 = vaddq_s16(b0, b1);
    *r1 = vsubq_s16(b0, b1);
    *r2 = vaddq_s16(b2, b3);
    *r3 = vsubq_s16(b2, b3);
    *r4 = vaddq_s16(b4, b5);
    *r5 = vsubq_s16(b4, b5);
    *r6 = vaddq_s16(b6, b7);
    *r7 = vsubq_s16(b6, b7);
}

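/* 2-D 8x8 Hadamard transform of the residue between an 8x8 source and
 * prediction block: vertical pass, 8x8 transpose, horizontal pass. For
 * chroma the inputs are de-interleaved and only plane 0 is transformed. */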
static INLINE void hadamard8x8(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3,
    int16x8_t *r4,
    int16x8_t *r5,
    int16x8_t *r6,
    int16x8_t *r7,
    WORD32 is_chroma)
{
    // compute error between src and pred
    RESIDUE(0, is_chroma);
    RESIDUE(1, is_chroma);
    RESIDUE(2, is_chroma);
    RESIDUE(3, is_chroma);
    RESIDUE(4, is_chroma);
    RESIDUE(5, is_chroma);
    RESIDUE(6, is_chroma);
    RESIDUE(7, is_chroma);

    // vertical hadamard tx
    hadamard8x8_one_pass(r0, r1, r2, r3, r4, r5, r6, r7);

    // transpose
    transpose_s16_8x8(r0, r1, r2, r3, r4, r5, r6, r7);

    // horizontal hadamard tx
    hadamard8x8_one_pass(r0, r1, r2, r3, r4, r5, r6, r7);
}

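/* 8x8 SATD: sum of absolute Hadamard coefficients, normalized as
 * (sum + 4) >> 3. When ac_only is set the DC coefficient is masked out. */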
static INLINE UWORD32 ihevce_HAD_8x8_8bit_plane_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD32 is_chroma,
    WORD32 ac_only)
{
    int16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
    int32x4_t b;
    int64x2_t c;
    int64_t satd;

    // hadamard 8x8
    hadamard8x8(
        pu1_src, src_strd, pu1_pred, pred_strd, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, is_chroma);

    if(ac_only)
    {
        const int16x8_t mask = vld1q_s16(gu2_dc_mask);
        a0 = vandq_s16(a0, mask);
    }

    // satd
    a0 = vabsq_s16(a0);
    a1 = vabsq_s16(a1);
    a0 = vaddq_s16(a0, a1);
    a2 = vabsq_s16(a2);
    a3 = vabsq_s16(a3);
    a2 = vaddq_s16(a2, a3);

    a4 = vabsq_s16(a4);
    a5 = vabsq_s16(a5);
    a4 = vaddq_s16(a4, a5);
    a6 = vabsq_s16(a6);
    a7 = vabsq_s16(a7);
    a6 = vaddq_s16(a6, a7);

    a0 = vaddq_s16(a0, a2);
    a4 = vaddq_s16(a4, a6);
    a0 = vaddq_s16(a0, a4);
    b = vpaddlq_s16(a0);
    c = vpaddlq_s32(b);
    satd = vget_lane_s64(vadd_s64(vget_low_s64(c), vget_high_s64(c)), 0);

    return ((satd + 4) >> 3);
}

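/* 4x4 SATD for a single block, with the whole block kept in registers.
 * Normalized as (sum + 2) >> 2; ac_only masks out the DC coefficient. */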
static INLINE UWORD32 ihevce_HAD_4x4_8bit_plane_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD32 is_chroma,
    WORD32 ac_only)
{
    uint8x16_t src_u8, pred_u8;
    int16x8_t res_01, res_23;
    int16x4_t h[4];
    int16x4_t v[4];
    int16x4x2_t trans_4[2];
    int16x8_t combined_rows[4];
    int32x4x2_t trans_8;
    int32x4_t sad_32_4[3];
    int32x2_t sad_32_2;
    int64x1_t sad_64_1;
    int32_t sad;

    if(!is_chroma)
    {
        src_u8 = load_unaligned_u8q(pu1_src, src_strd);
        pred_u8 = load_unaligned_u8q(pu1_pred, pred_strd);
    }
    else
    {
        src_u8 = load_unaligned_u8qi(pu1_src, src_strd);
        pred_u8 = load_unaligned_u8qi(pu1_pred, pred_strd);
    }
    res_01 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8)));
    res_23 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(src_u8), vget_high_u8(pred_u8)));

    h[0] = vadd_s16(vget_low_s16(res_01), vget_high_s16(res_23));
    h[1] = vadd_s16(vget_high_s16(res_01), vget_low_s16(res_23));
    h[2] = vsub_s16(vget_high_s16(res_01), vget_low_s16(res_23));
    h[3] = vsub_s16(vget_low_s16(res_01), vget_high_s16(res_23));

    v[0] = vadd_s16(h[0], h[1]);
    v[1] = vadd_s16(h[3], h[2]);
    v[2] = vsub_s16(h[0], h[1]);
    v[3] = vsub_s16(h[3], h[2]);

    trans_4[0] = vtrn_s16(v[0], v[2]);
    trans_4[1] = vtrn_s16(v[1], v[3]);

    combined_rows[0] = vcombine_s16(trans_4[0].val[0], trans_4[1].val[0]);
    combined_rows[1] = vcombine_s16(trans_4[0].val[1], trans_4[1].val[1]);

    combined_rows[2] = vaddq_s16(combined_rows[0], combined_rows[1]);
    combined_rows[3] = vsubq_s16(combined_rows[0], combined_rows[1]);

    trans_8 =
        vtrnq_s32(vreinterpretq_s32_s16(combined_rows[2]), vreinterpretq_s32_s16(combined_rows[3]));

    combined_rows[0] =
        vaddq_s16(vreinterpretq_s16_s32(trans_8.val[0]), vreinterpretq_s16_s32(trans_8.val[1]));
    combined_rows[0] = vabsq_s16(combined_rows[0]);
    combined_rows[1] =
        vsubq_s16(vreinterpretq_s16_s32(trans_8.val[0]), vreinterpretq_s16_s32(trans_8.val[1]));
    combined_rows[1] = vabsq_s16(combined_rows[1]);

    if(ac_only)
    {
        const int16x8_t mask = vld1q_s16(gu2_dc_mask);
        combined_rows[0] = vandq_s16(combined_rows[0], mask);
    }

    sad_32_4[0] = vpaddlq_s16(combined_rows[0]);
    sad_32_4[1] = vpaddlq_s16(combined_rows[1]);
    sad_32_4[2] = vaddq_s32(sad_32_4[0], sad_32_4[1]);
    sad_32_2 = vadd_s32(vget_high_s32(sad_32_4[2]), vget_low_s32(sad_32_4[2]));
    sad_64_1 = vpaddl_s32(sad_32_2);
    sad = vget_lane_s64(sad_64_1, 0);

    return ((sad + 2) >> 2);
}

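/* Thin wrappers around the plane functions; the pi2_dst / dst_strd arguments
 * are part of the common function-pointer signature and are unused here. */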
UWORD32 ihevce_HAD_4x4_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_4x4_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 0);
}

UWORD32 ihevce_chroma_compute_AC_HAD_4x4_8bit_neon(
    UWORD8 *pu1_origin,
    WORD32 src_strd,
    UWORD8 *pu1_pred_buf,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_4x4_8bit_plane_neon(pu1_origin, src_strd, pu1_pred_buf, pred_strd, 1, 1);
}

UWORD32 ihevce_HAD_8x8_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 0);
}

UWORD32 ihevce_compute_ac_had_8x8_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 1);
}

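/* 16x16 SATD built from four 8x8 Hadamard transforms. The last butterfly
 * stage uses halving adds/subs (vhaddq/vhsubq) to keep the 16-bit lanes from
 * overflowing, which also folds part of the overall normalization into the
 * butterflies. */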
UWORD32 ihevce_HAD_16x16_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    int16x8_t b0[8];
    int16x8_t b1[8];
    int16x8_t b2[8];
    int16x8_t b3[8];
    uint32x4_t sum = vdupq_n_u32(0);
    uint64x2_t c;
    uint64_t satd;
    WORD32 i;

    (void)pi2_dst;
    (void)dst_strd;

    // hadamard 8x8 - b0
    hadamard8x8(
        pu1_src,
        src_strd,
        pu1_pred,
        pred_strd,
        &b0[0],
        &b0[1],
        &b0[2],
        &b0[3],
        &b0[4],
        &b0[5],
        &b0[6],
        &b0[7],
        0);
    // hadamard 8x8 - b1
    hadamard8x8(
        pu1_src + 8,
        src_strd,
        pu1_pred + 8,
        pred_strd,
        &b1[0],
        &b1[1],
        &b1[2],
        &b1[3],
        &b1[4],
        &b1[5],
        &b1[6],
        &b1[7],
        0);
    // hadamard 8x8 - b2
    hadamard8x8(
        pu1_src + (8 * src_strd),
        src_strd,
        pu1_pred + (8 * pred_strd),
        pred_strd,
        &b2[0],
        &b2[1],
        &b2[2],
        &b2[3],
        &b2[4],
        &b2[5],
        &b2[6],
        &b2[7],
        0);
    // hadamard 8x8 - b3
    hadamard8x8(
        pu1_src + (8 * src_strd) + 8,
        src_strd,
        pu1_pred + (8 * pred_strd) + 8,
        pred_strd,
        &b3[0],
        &b3[1],
        &b3[2],
        &b3[3],
        &b3[4],
        &b3[5],
        &b3[6],
        &b3[7],
        0);

    for(i = 0; i < 8; i++)
    {
        int16x8_t p0 = vhaddq_s16(b0[i], b1[i]);
        int16x8_t p1 = vhsubq_s16(b0[i], b1[i]);
        int16x8_t p2 = vhaddq_s16(b2[i], b3[i]);
        int16x8_t p3 = vhsubq_s16(b2[i], b3[i]);

        int16x8_t q0 = vaddq_s16(p0, p2);
        int16x8_t q1 = vsubq_s16(p0, p2);
        int16x8_t q2 = vaddq_s16(p1, p3);
        int16x8_t q3 = vsubq_s16(p1, p3);

        uint16x8_t r0 =
            vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(q0)), vreinterpretq_u16_s16(vabsq_s16(q1)));
        uint16x8_t r1 =
            vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(q2)), vreinterpretq_u16_s16(vabsq_s16(q3)));

        uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
        uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

        sum = vaddq_u32(sum, s0);
        sum = vaddq_u32(sum, s1);
    }

    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    return ((satd + 4) >> 3);
}

UWORD32 ihevce_chroma_HAD_4x4_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_4x4_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
}

UWORD32 ihevce_chroma_HAD_8x8_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
}

UWORD32 ihevce_chroma_HAD_16x16_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    UWORD32 au4_satd[4];

    (void)pi2_dst;
    (void)dst_strd;
    au4_satd[0] = ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
    au4_satd[1] =
        ihevce_HAD_8x8_8bit_plane_neon(pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, 1, 0);
    au4_satd[2] = ihevce_HAD_8x8_8bit_plane_neon(
        pu1_src + 8 * src_strd, src_strd, pu1_pred + 8 * pred_strd, pred_strd, 1, 0);
    au4_satd[3] = ihevce_HAD_8x8_8bit_plane_neon(
        pu1_src + 8 * src_strd + 16, src_strd, pu1_pred + 8 * pred_strd + 16, pred_strd, 1, 0);

    return au4_satd[0] + au4_satd[1] + au4_satd[2] + au4_satd[3];
}

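/* 32x32 SATD built hierarchically: 8x8 Hadamard transforms are combined into
 * 16x16 (halving butterflies plus an explicit >> 2), then into 32x32; the
 * final sum is normalized as (sum + 2) >> 2. */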
UWORD32 ihevce_HAD_32x32_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    int16x8_t a[4][4][8];
    uint32x4_t sum = vdupq_n_u32(0);
    WORD32 b8, b16;
    uint64x2_t c;
    uint64_t satd;
    WORD32 i, j;

    (void)pi2_dst;
    (void)dst_strd;
    // hadamard 32x32
    for(b16 = 0; b16 < 4; b16++)
    {
        UWORD8 *pu1_src_b16 = pu1_src + (b16 >> 1) * (src_strd * 16) + ((b16 & 1) * 16);
        UWORD8 *pu1_pred_b16 = pu1_pred + (b16 >> 1) * (pred_strd * 16) + ((b16 & 1) * 16);
        // hadamard 16x16
        for(b8 = 0; b8 < 4; b8++)
        {
            UWORD8 *pu1_src_b8 = pu1_src_b16 + (b8 >> 1) * (src_strd * 8) + ((b8 & 1) * 8);
            UWORD8 *pu1_pred_b8 = pu1_pred_b16 + (b8 >> 1) * (pred_strd * 8) + ((b8 & 1) * 8);
            // hadamard 8x8
            hadamard8x8(
                pu1_src_b8,
                src_strd,
                pu1_pred_b8,
                pred_strd,
                &a[b16][b8][0],
                &a[b16][b8][1],
                &a[b16][b8][2],
                &a[b16][b8][3],
                &a[b16][b8][4],
                &a[b16][b8][5],
                &a[b16][b8][6],
                &a[b16][b8][7],
                0);
        }
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vhaddq_s16(a[b16][0][i], a[b16][1][i]);
            int16x8_t p1 = vhsubq_s16(a[b16][0][i], a[b16][1][i]);
            int16x8_t p2 = vhaddq_s16(a[b16][2][i], a[b16][3][i]);
            int16x8_t p3 = vhsubq_s16(a[b16][2][i], a[b16][3][i]);

            a[b16][0][i] = vaddq_s16(p0, p2);
            a[b16][1][i] = vsubq_s16(p0, p2);
            a[b16][2][i] = vaddq_s16(p1, p3);
            a[b16][3][i] = vsubq_s16(p1, p3);

            a[b16][0][i] = vshrq_n_s16(a[b16][0][i], 2);
            a[b16][1][i] = vshrq_n_s16(a[b16][1][i], 2);
            a[b16][2][i] = vshrq_n_s16(a[b16][2][i], 2);
            a[b16][3][i] = vshrq_n_s16(a[b16][3][i], 2);
        }
    }
    for(j = 0; j < 4; j++)
    {
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vaddq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p1 = vsubq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p2 = vaddq_s16(a[2][j][i], a[3][j][i]);
            int16x8_t p3 = vsubq_s16(a[2][j][i], a[3][j][i]);

            int16x8_t q0 = vaddq_s16(p0, p2);
            int16x8_t q1 = vsubq_s16(p0, p2);
            int16x8_t q2 = vaddq_s16(p1, p3);
            int16x8_t q3 = vsubq_s16(p1, p3);

            uint16x8_t r0 = vaddq_u16(
                vreinterpretq_u16_s16(vabsq_s16(q0)), vreinterpretq_u16_s16(vabsq_s16(q1)));
            uint16x8_t r1 = vaddq_u16(
                vreinterpretq_u16_s16(vabsq_s16(q2)), vreinterpretq_u16_s16(vabsq_s16(q3)));

            uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
            uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

            sum = vaddq_u32(sum, s0);
            sum = vaddq_u32(sum, s1);
        }
    }
    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    return ((satd + 2) >> 2);
}

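/* Four 4x4 SATDs of an 8x8 region in one call; the per-block SATDs are
 * written to pi4_hsad and their sum is returned. The dst and qstep arguments
 * are unused in this variant. */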
WORD32 ihevce_had4_4x4_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst4x4,
    WORD32 dst_strd,
    WORD32 *pi4_hsad,
    WORD32 hsad_stride,
    WORD32 i4_frm_qstep)
{
    int16x8_t a[8];

    (void)pi2_dst4x4;
    (void)dst_strd;
    (void)i4_frm_qstep;

    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call --------- */
    hadamard4x4_4(
        pu1_src,
        src_strd,
        pu1_pred,
        pred_strd,
        &a[0],
        &a[1],
        &a[2],
        &a[3],
        &a[4],
        &a[5],
        &a[6],
        &a[7]);

    return hadamard_sad4x4_4(a, pi4_hsad, hsad_stride);
}

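/* Recursive 8x8 HAD: computes the four 4x4 SATDs (cost of splitting) and the
 * 8x8 SATD derived from them (cost of not splitting), picks the cheaper,
 * stores the 8x8 coefficients to pi2_dst, and packs the result as
 * (best_cost << 2) | (tu_split_flag << 1) | early_cbf. */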
WORD32 ihevce_had_8x8_using_4_4x4_r_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 **ppi4_hsad,
    WORD32 **ppi4_tu_split,
    WORD32 **ppi4_tu_early_cbf,
    WORD32 pos_x_y_4x4,
    WORD32 num_4x4_in_row,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    WORD32 i4_cur_depth,
    WORD32 i4_max_depth,
    WORD32 i4_max_tr_size,
    WORD32 *pi4_tu_split_cost,
    void *pv_func_sel)
{
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;

    WORD32 *pi4_4x4_hsad;
    WORD32 *pi4_8x8_hsad;
    WORD32 *pi4_8x8_tu_split;
    WORD32 *pi4_8x8_tu_early_cbf;

    WORD32 cost_child, cost_parent;
    WORD32 best_cost;
    WORD32 early_cbf = 0;
    const UWORD8 u1_cur_tr_size = 8;

    WORD32 i;

    int16x8_t a[8];

    (void)pv_func_sel;

    assert(pos_x >= 0);
    assert(pos_y >= 0);

    /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */
    pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
    pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    pi4_8x8_tu_early_cbf =
        ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);

    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call --------- */
    hadamard4x4_4(
        pu1_src,
        src_strd,
        pu1_pred,
        pred_strd,
        &a[0],
        &a[1],
        &a[2],
        &a[3],
        &a[4],
        &a[5],
        &a[6],
        &a[7]);

    /* -------- cost child -------- */
    cost_child = hadamard_sad4x4_4(a, pi4_4x4_hsad, num_4x4_in_row);
    /* 4 CBF flags; the extra +1 in the shift assumes 0.5 bits per bin */
    cost_child += ((4) * lambda) >> (lambda_q_shift + 1);

    /* -------- cost parent -------- */
    cost_parent = hadamard_sad8x8_using4x4(a, &early_cbf, i4_frm_qstep);
    for(i = 0; i < 8; i++, pi2_dst += dst_strd)
        vst1q_s16(pi2_dst, a[i]);

    if(i4_cur_depth < i4_max_depth)
    {
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
        {
            *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
            best_cost = cost_child;
            best_cost <<= 1;
            best_cost++;
            pi4_8x8_tu_split[0] = 1;
            pi4_8x8_hsad[0] = cost_child;
        }
        else
        {
            best_cost = cost_parent;
            best_cost <<= 1;
            pi4_8x8_tu_split[0] = 0;
            pi4_8x8_hsad[0] = cost_parent;
        }
    }
    else
    {
        best_cost = cost_parent;
        best_cost <<= 1;
        pi4_8x8_tu_split[0] = 0;
        pi4_8x8_hsad[0] = cost_parent;
    }

    pi4_8x8_tu_early_cbf[0] = early_cbf;

    /* best_cost carries the tu_split flag at its LSB (least significant bit);
    shift in the early CBF flag below it */
    return ((best_cost << 1) + early_cbf);
}

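/* Combines four 8x8 Hadamard results (laid out in a 16x16 array with stride
 * had8_strd) into the 16x16 transform, stores the coefficients to pi2_dst,
 * flags early CBF against the i4_frm_qstep / 256 threshold, and returns the
 * SATD normalized as (sum + 4) >> 3. */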
static WORD32 ihevce_compute_16x16HAD_using_8x8_neon(
    WORD16 *pi2_8x8_had,
    WORD32 had8_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 i4_frm_qstep,
    WORD32 *pi4_cbf)
{
    int16x8_t b0[8];
    int16x8_t b1[8];
    int16x8_t b2[8];
    int16x8_t b3[8];
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    uint32x4_t sum = vdupq_n_u32(0);
    uint64x2_t c;
    uint64_t satd;
    WORD32 i;

    for(i = 0; i < 8; i++, pi2_8x8_had += had8_strd)
    {
        b0[i] = vld1q_s16(pi2_8x8_had);
        b1[i] = vld1q_s16(pi2_8x8_had + 8);
    }
    for(i = 0; i < 8; i++, pi2_8x8_had += had8_strd)
    {
        b2[i] = vld1q_s16(pi2_8x8_had);
        b3[i] = vld1q_s16(pi2_8x8_had + 8);
    }

#define EARLY_EXIT(k)                                                                              \
    {                                                                                              \
        p##k = vabsq_s16(q##k);                                                                    \
        if(*pi4_cbf == 0)                                                                          \
        {                                                                                          \
            uint16x8_t cmp;                                                                        \
            cmp = vcgtq_s16(p##k, threshold);                                                      \
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp)), 0) ||                        \
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp)), 0))                         \
            {                                                                                      \
                *pi4_cbf = 1;                                                                      \
            }                                                                                      \
        }                                                                                          \
    }
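    /* Note: unlike the copy in hadamard_sad8x8_using4x4, EARLY_EXIT is left
     * defined here; ihevce_compute_32x32HAD_using_16x16_neon below reuses it. */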
    for(i = 0; i < 8; i++, pi2_dst += dst_strd)
    {
        int16x8_t p0 = vhaddq_s16(b0[i], b1[i]);
        int16x8_t p1 = vhsubq_s16(b0[i], b1[i]);
        int16x8_t p2 = vhaddq_s16(b2[i], b3[i]);
        int16x8_t p3 = vhsubq_s16(b2[i], b3[i]);

        int16x8_t q0 = vaddq_s16(p0, p2);
        int16x8_t q1 = vsubq_s16(p0, p2);
        int16x8_t q2 = vaddq_s16(p1, p3);
        int16x8_t q3 = vsubq_s16(p1, p3);

        vst1q_s16(pi2_dst, q0);
        EARLY_EXIT(0);
        vst1q_s16(pi2_dst + 8, q1);
        EARLY_EXIT(1);
        vst1q_s16(pi2_dst + 8 * dst_strd, q2);
        EARLY_EXIT(2);
        vst1q_s16(pi2_dst + 8 * dst_strd + 8, q3);
        EARLY_EXIT(3);
        uint16x8_t r0 = vaddq_u16(vreinterpretq_u16_s16(p0), vreinterpretq_u16_s16(p1));
        uint16x8_t r1 = vaddq_u16(vreinterpretq_u16_s16(p2), vreinterpretq_u16_s16(p3));

        uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
        uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

        sum = vaddq_u32(sum, s0);
        sum = vaddq_u32(sum, s1);
    }

    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    return ((satd + 4) >> 3);
}

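/* Recursive 16x16 HAD: evaluates the four 8x8 recursions (child cost) and the
 * 16x16 transform built from their 8x8 results (parent cost), then packs the
 * decision as (best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag,
 * where each 5-bit flag field holds one bit per 8x8 sub-block plus one for
 * the 16x16 level. */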
WORD32 ihevce_had_16x16_r_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 **ppi4_hsad,
    WORD32 **ppi4_tu_split,
    WORD32 **ppi4_tu_early_cbf,
    WORD32 pos_x_y_4x4,
    WORD32 num_4x4_in_row,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    WORD32 i4_cur_depth,
    WORD32 i4_max_depth,
    WORD32 i4_max_tr_size,
    WORD32 *pi4_tu_split_cost,
    void *pv_func_sel)
{
    WORD16 ai2_8x8_had[256];

    WORD32 *pi4_16x16_hsad;
    WORD32 *pi4_16x16_tu_split;
    WORD32 *pi4_16x16_tu_early_cbf;

    WORD32 best_cost, best_cost_tu_split;
    WORD32 tu_split_flag = 0;
    WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
    WORD32 cost_parent, cost_child = 0;

    const UWORD8 u1_cur_tr_size = 16;

    WORD32 i;

    WORD16 *pi2_y0;
    UWORD8 *src, *pred;
    WORD32 pos_x_y_4x4_0;

    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;

    assert(pos_x >= 0);
    assert(pos_y >= 0);

    /* Initialize pointers to store 16x16 SATDs */
    pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

    pi4_16x16_tu_split =
        ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

    pi4_16x16_tu_early_cbf =
        ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

    /* -------- Compute four 8x8 HAD Transforms of the 16x16 in one call --------- */
    for(i = 0; i < 4; i++)
    {
        src = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
        pred = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);

        best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r_neon(
            src,
            src_strd,
            pred,
            pred_strd,
            pi2_y0,
            16,
            ppi4_hsad,
            ppi4_tu_split,
            ppi4_tu_early_cbf,
            pos_x_y_4x4_0,
            num_4x4_in_row,
            lambda,
            lambda_q_shift,
            i4_frm_qstep,
            i4_cur_depth + 1,
            i4_max_depth,
            i4_max_tr_size,
            pi4_tu_split_cost,
            pv_func_sel);

        /* Cost is shifted by two bits for the tu_split and early CBF flags */
        best_cost = (best_cost_tu_split >> 2);

        /* The last but one bit stores the TU_Split information */
        tu_split_flag += (best_cost_tu_split & 0x3) >> 1;

        /* The last bit stores the early CBF information */
        i4_early_cbf_flag += (best_cost_tu_split & 0x1);

        cost_child += best_cost;

        tu_split_flag <<= 1;
        i4_early_cbf_flag <<= 1;
    }

    /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
    pi2_y0 = ai2_8x8_had;

    /* i4_frm_qstep drives the early-CBF threshold inside */
    cost_parent = ihevce_compute_16x16HAD_using_8x8_neon(
        pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);

    /* 4 TU_Split flags, 4 CBF flags; the extra +1 in the shift assumes 0.5 bits per bin */
    cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);

    i4_early_cbf_flag += early_cbf;

    /* Right now the depth is hard-coded to 4; it can be modified from the
    config file, which decides the extent to which TU recursion is done */
    if(i4_cur_depth < i4_max_depth)
    {
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
        {
            *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
            tu_split_flag += 1;
            best_cost = cost_child;
        }
        else
        {
            tu_split_flag += 0;
            best_cost = cost_parent;
        }
    }
    else
    {
        tu_split_flag += 0;
        best_cost = cost_parent;
    }

    pi4_16x16_hsad[0] = best_cost;
    pi4_16x16_tu_split[0] = tu_split_flag;
    pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;

    /* pack three values (best cost, tu_split flags and early CBF flags) into
    a single return value */
    return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
}

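/* Combines four 16x16 Hadamard results into the 32x32 transform. Inputs are
 * pre-scaled by >> 2 to stay within 16 bits, early CBF is flagged via the
 * EARLY_EXIT macro still in scope from
 * ihevce_compute_16x16HAD_using_8x8_neon, and the (unnormalized) absolute
 * sum is returned; pi2_dst is unused. */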
UWORD32 ihevce_compute_32x32HAD_using_16x16_neon(
    WORD16 *pi2_16x16_had,
    WORD32 had16_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 i4_frm_qstep,
    WORD32 *pi4_cbf)
{
    int16x8_t a[4][4][8];
    uint32x4_t sum = vdupq_n_u32(0);
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    WORD32 b8, b16;
    uint64x2_t c;
    WORD32 i, j;

    (void)pi2_dst;
    (void)dst_strd;

    for(b16 = 0; b16 < 4; b16++)
    {
        WORD16 *pi2_b16 = pi2_16x16_had + (b16 >> 1) * (had16_strd * 16) + ((b16 & 1) * 16);

        for(b8 = 0; b8 < 4; b8++)
        {
            WORD16 *pi2_b8 = pi2_b16 + (b8 >> 1) * (had16_strd * 8) + ((b8 & 1) * 8);

            for(i = 0; i < 8; i++, pi2_b8 += had16_strd)
            {
                a[b16][b8][i] = vld1q_s16(pi2_b8);
                a[b16][b8][i] = vshrq_n_s16(a[b16][b8][i], 2);
            }
        }
    }

    for(j = 0; j < 4; j++)
    {
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vaddq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p1 = vsubq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p2 = vaddq_s16(a[2][j][i], a[3][j][i]);
            int16x8_t p3 = vsubq_s16(a[2][j][i], a[3][j][i]);

            int16x8_t q0 = vaddq_s16(p0, p2);
            int16x8_t q1 = vsubq_s16(p0, p2);
            int16x8_t q2 = vaddq_s16(p1, p3);
            int16x8_t q3 = vsubq_s16(p1, p3);

            EARLY_EXIT(0);
            EARLY_EXIT(1);
            EARLY_EXIT(2);
            EARLY_EXIT(3);

            uint16x8_t r0 = vaddq_u16(vreinterpretq_u16_s16(p0), vreinterpretq_u16_s16(p1));
            uint16x8_t r1 = vaddq_u16(vreinterpretq_u16_s16(p2), vreinterpretq_u16_s16(p3));

            uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
            uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

            sum = vaddq_u32(sum, s0);
            sum = vaddq_u32(sum, s1);
        }
    }
    c = vpaddlq_u32(sum);

    return vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);
}