/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*  Parthiban V
*
* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"


/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and
* stores the result at the location pointed to by pu1_dst.
* Assumptions: the function is optimized assuming the width is a multiple
* of 4 (the inner loop processes four samples per iteration) and the height
* is a multiple of 2.
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied with the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added to the source before weighting and shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                            */
    /* The height loop is unrolled, so 2 rows are processed per iteration,                      */
    /* and the stores likewise handle two rows.                                                 */
    /* vcombine_u16 is used because vqmovun_s32 leaves a uint16x4 vector, which                 */
    /* cannot be saturated and narrowed directly; the final saturating narrow                   */
    /* (vqmovn_u16) needs a full uint16x8.                                                      */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI
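
/* For reference, a minimal scalar sketch of the same uni-weighted formula the
 * NEON routine above implements. This function is an illustrative assumption
 * added for clarity; it is not part of the library. */
static void weighted_pred_uni_scalar_sketch(WORD16 *pi2_src,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd,
                                            WORD32 dst_strd,
                                            WORD32 wgt0,
                                            WORD32 off0,
                                            WORD32 shift,
                                            WORD32 lvl_shift,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* Weight the level-shifted sample, round, shift down, add offset */
            WORD32 val = (pi2_src[col] + lvl_shift) * wgt0;
            val = ((val + (1 << (shift - 1))) >> shift) + off0;
            /* Clamp to the 8-bit range, mirroring vqmovun's saturation */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}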

/**
*******************************************************************************
*
* @brief
*  Does chroma uni-weighted prediction on the array pointed to by pi2_src and
* stores the result at the location pointed to by pu1_dst.
* Assumptions: the function is optimized assuming the width (per chroma
* component) and the height are multiples of 2.
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
*
* @param[in] pi2_src
*  Pointer to the source (interleaved CbCr)
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  weight to be multiplied with the source Cb samples
*
* @param[in] wgt0_cr
*  weight to be multiplied with the source Cr samples
*
* @param[in] off0_cb
*  Cb offset to be added after rounding and shifting
*
* @param[in] off0_cr
*  Cr offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added to the source before weighting and shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (per chroma component)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.                            */
    /* The height loop is unrolled, so 2 rows are processed per iteration,                      */
    /* and the stores likewise handle two rows.                                                 */
    /* vcombine_u16 is used because vqmovun_s32 leaves a uint16x4 vector, which                 */
    /* cannot be saturated and narrowed directly; the final saturating narrow                   */
    /* (vqmovn_u16) needs a full uint16x8.                                                      */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI
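
/* For reference, a scalar sketch of the interleaved-chroma handling above: the
 * NEON code zips the Cb/Cr weights and offsets so one vector multiply applies
 * the right per-component values to the interleaved CbCr samples. This
 * function is an illustrative assumption, not part of the library. */
static void weighted_pred_chroma_uni_scalar_sketch(WORD16 *pi2_src,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd,
                                                   WORD32 dst_strd,
                                                   WORD32 wgt0_cb,
                                                   WORD32 wgt0_cr,
                                                   WORD32 off0_cb,
                                                   WORD32 off0_cr,
                                                   WORD32 shift,
                                                   WORD32 lvl_shift,
                                                   WORD32 ht,
                                                   WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        /* Source is interleaved CbCr: even columns are Cb, odd columns Cr */
        for(col = 0; col < 2 * wd; col++)
        {
            WORD32 wgt = (col & 1) ? wgt0_cr : wgt0_cb;
            WORD32 off = (col & 1) ? off0_cr : off0_cb;
            WORD32 val = (pi2_src[col] + lvl_shift) * wgt;
            val = ((val + (1 << (shift - 1))) >> shift) + off;
            /* Clamp to the 8-bit range, mirroring vqmovun's saturation */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}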

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
* pi2_src2 and stores the result at the location pointed to by pu1_dst.
* Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied with source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied with source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added to source 1 before weighting and shift
*
* @param[in] lvl_shift2
*  added to source 2 before weighting and shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* The i4_tmp1_* and i4_tmp2_* accumulators are used to process 2 rows at a time.           */
    /* The height loop is unrolled, so 2 rows are processed per iteration,                      */
    /* and the stores likewise handle two rows.                                                 */
    /* vcombine_u16 is used because vqmovun_s32 leaves a uint16x4 vector, which                 */
    /* cannot be saturated and narrowed directly; the final saturating narrow                   */
    /* (vqmovn_u16) needs a full uint16x8.                                                      */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI
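
/* For reference, a minimal scalar sketch of the bi-weighted formula the NEON
 * routine above implements. This function is an illustrative assumption added
 * for clarity; it is not part of the library. */
static void weighted_pred_bi_scalar_sketch(WORD16 *pi2_src1,
                                           WORD16 *pi2_src2,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd1,
                                           WORD32 src_strd2,
                                           WORD32 dst_strd,
                                           WORD32 wgt0,
                                           WORD32 off0,
                                           WORD32 wgt1,
                                           WORD32 off1,
                                           WORD32 shift,
                                           WORD32 lvl_shift1,
                                           WORD32 lvl_shift2,
                                           WORD32 ht,
                                           WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* Weight both level-shifted sources, add the combined offset and
             * rounding term, then shift down */
            WORD32 val = (pi2_src1[col] + lvl_shift1) * wgt0
                       + (pi2_src2[col] + lvl_shift2) * wgt1
                       + ((off0 + off1 + 1) << (shift - 1));
            val >>= shift;
            /* Clamp to the 8-bit range, mirroring vqmovun's saturation */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}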

/**
*******************************************************************************
*
* @brief
*  Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1
* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
* Assumptions: the function is optimized assuming the width (per chroma
* component) and the height are multiples of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1 (interleaved CbCr)
*
* @param[in] pi2_src2
*  Pointer to source 2 (interleaved CbCr)
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Cb weight to be multiplied with source 1
*
* @param[in] wgt0_cr
*  Cr weight to be multiplied with source 1
*
* @param[in] off0_cb
*  Cb offset 0
*
* @param[in] off0_cr
*  Cr offset 0
*
* @param[in] wgt1_cb
*  Cb weight to be multiplied with source 2
*
* @param[in] wgt1_cr
*  Cr weight to be multiplied with source 2
*
* @param[in] off1_cb
*  Cb offset 1
*
* @param[in] off1_cr
*  Cr offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added to source 1 before weighting and shift
*
* @param[in] lvl_shift2
*  added to source 2 before weighting and shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (per chroma component)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);

    /* The i4_tmp1_* and i4_tmp2_* accumulators are used to process 2 rows at a time.           */
    /* The height loop is unrolled, so 2 rows are processed per iteration,                      */
    /* and the stores likewise handle two rows.                                                 */
    /* vcombine_u16 is used because vqmovun_s32 leaves a uint16x4 vector, which                 */
    /* cannot be saturated and narrowed directly; the final saturating narrow                   */
    /* (vqmovn_u16) needs a full uint16x8.                                                      */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
* Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added to source 1 before shift
*
* @param[in] lvl_shift2
*  added to source 2 before shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    WORD32 tmp_shift = 0 - shift;
    WORD32 tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    int16x4_t lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    int16x4_t lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* The i4_tmp1_* and i4_tmp2_* accumulators are used to process 2 rows at a time.           */
    /* The height loop is unrolled, so 2 rows are processed per iteration,                      */
    /* and the stores likewise handle two rows.                                                 */
    /* vcombine_u16 is used because vqmovun_s32 leaves a uint16x4 vector, which                 */
    /* cannot be saturated and narrowed directly; the final saturating narrow                   */
    /* (vqmovn_u16) needs a full uint16x8.                                                      */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT
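
/* For reference, a minimal scalar sketch of the default bi-prediction average
 * the NEON routine above implements (shift = 15 - BitDepth, i.e.
 * SHIFT_14_MINUS_BIT_DEPTH + 1). This function is an illustrative assumption
 * added for clarity; it is not part of the library. */
static void weighted_pred_bi_default_scalar_sketch(WORD16 *pi2_src1,
                                                   WORD16 *pi2_src2,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd1,
                                                   WORD32 src_strd2,
                                                   WORD32 dst_strd,
                                                   WORD32 lvl_shift1,
                                                   WORD32 lvl_shift2,
                                                   WORD32 ht,
                                                   WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* Sum the two level-shifted sources plus the rounding term,
             * then shift down */
            WORD32 val = (pi2_src1[col] + lvl_shift1)
                       + (pi2_src2[col] + lvl_shift2)
                       + (1 << (shift - 1));
            val >>= shift;
            /* Clamp to the 8-bit range, mirroring vqmovun's saturation */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}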

/**
*******************************************************************************
*
* @brief
*  Does chroma default bi-weighted prediction on the arrays pointed to by
* pi2_src1 and pi2_src2 and stores the result at the location pointed to by
* pu1_dst. Assumptions: the function is optimized assuming the width (per
* chroma component) and the height are multiples of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1 (interleaved CbCr)
*
* @param[in] pi2_src2
*  Pointer to source 2 (interleaved CbCr)
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added to source 1 before shift
*
* @param[in] lvl_shift2
*  added to source 2 before shift
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (per chroma component)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;
    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* The i4_tmp1_* and i4_tmp2_* accumulators are used to process 2 rows at a time.           */
    /* The height loop is unrolled, so 2 rows are processed per iteration,                      */
    /* and the stores likewise handle two rows.                                                 */
    /* vcombine_u16 is used because vqmovun_s32 leaves a uint16x4 vector, which                 */
    /* cannot be saturated and narrowed directly; the final saturating narrow                   */
    /* (vqmovn_u16) needs a full uint16x8.                                                      */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT