/******************************************************************************
 *
 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_intra_pred_filters_neon_intr.c
 *
 * @brief
 *  Contains function definitions for the intra prediction interpolation filters
 *
 * @author
 *  Yogeswaran RS
 *
 * @par List of Functions:
 *  - ihevc_intra_pred_luma_ref_substitution()
 *  - ihevc_intra_pred_ref_filtering()
 *  - ihevc_intra_pred_luma_planar()
 *  - ihevc_intra_pred_luma_dc()
 *  - ihevc_intra_pred_luma_horz()
 *  - ihevc_intra_pred_luma_ver()
 *  - ihevc_intra_pred_luma_mode2()
 *  - ihevc_intra_pred_luma_mode_18_34()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64



/*****************************************************************************/
/* Table Look-up                                                             */
/*****************************************************************************/

#define GET_BITS(y, x) (((y) >> (x)) & 1)
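/* Illustrative example: with y = 0x1F3 (0b1_1111_0011), GET_BITS(y, 4) is 1
 * since bit 4 is set, while GET_BITS(y, 2) is 0. */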

/*****************************************************************************/
/* Function Definition                                                      */
/*****************************************************************************/

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for pu1_ref substitution
 *
 *
 * @par Description:
 *    Reference substitution process for samples unavailable for prediction.
 *    Refer to section 8.4.4.2.2 of the standard.
 *
 * @param[in] pu1_top_left
 *  UWORD8 pointer to the top-left samples
 *
 * @param[in] pu1_top
 *  UWORD8 pointer to the top samples
 *
 * @param[in] pu1_left
 *  UWORD8 pointer to the left samples
 *
 * @param[in] src_strd
 *  WORD32 source stride
 *
 * @param[in] nt
 *  WORD32 transform block size
 *
 * @param[in] nbr_flags
 *  WORD32 neighbor availability flags
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination (substituted reference array)
 *
 * @param[in] dst_strd
 *  WORD32 destination stride
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
                                                     UWORD8 *pu1_top,
                                                     UWORD8 *pu1_left,
                                                     WORD32 src_strd,
                                                     WORD32 nt,
                                                     WORD32 nbr_flags,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;
    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);
    dc_val = 1 << (BIT_DEPTH - 1);
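    /* 1 << (BIT_DEPTH - 1) = 128 for 8-bit: the mid-grey default used when
     * no neighbouring samples are available */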

    /* Neighbor Flag Structure */
    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
              1         4         4     4         4
     */
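    /* Illustrative values: for nt <= 8 only one bit of each 4-bit field is
     * used, so nbr_flags == 0x11111 marks all five neighbours available; for
     * nt == 32 every bit of each field is used (one bit per 8 pels), so all
     * neighbours available corresponds to nbr_flags == 0x1FFFF. */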

    /* If no neighbor flags are present, fill the neighbor samples with DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples */
        pu1_dst[two_nt] = *pu1_top_left;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        UWORD8 *pu1_top_tmp = pu1_top;
        pu1_dst_tmp2 += two_nt + 1;

        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];

        uint8x8_t src;
        for(i = two_nt; i > 0; i -= 8)
        {
            src = vld1_u8(pu1_top_tmp);
            pu1_top_tmp += 8;
            vst1_u8(pu1_dst_tmp2, src);
            pu1_dst_tmp2 += 8;
        }

        if(nt <= 8)
        {
            /* 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = nbr_flags & 0x1;
            left = (nbr_flags & 0x10) >> 4;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, apply the reverse substitution process */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from bottom-left */
                while(!a_nbr_flag[next])
                    next++;

                /* If left or top-left is the first available */
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top or top-right is the first available */
                {
                    /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward Substitution Process */
            /* If left is unavailable, copy the last bottom-left value */

            if(left == 0)
            {
                uint8x8_t dup_pu1_dst1;
                UWORD8 *pu1_dst_const_nt = pu1_dst;
                pu1_dst_const_nt += nt;

                if(0 == (nt & 7))
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
                        pu1_dst_const_nt += 8;
                    }
                }
                else
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
                        pu1_dst_const_nt += 4;
                    }
                }
            }
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            if(top == 0)
            {
                if(0 == (nt & 7))
                {
                    uint8x8_t dup_pu1_dst2;
                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
                    pu1_dst_const_two_nt_1 += (two_nt + 1);
                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
                        pu1_dst_const_two_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = 0; i < nt; i++)
                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                }
            }
            if(tp_right == 0)
            {
                uint8x8_t dup_pu1_dst3;
                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
                pu1_dst_const_three_nt_1 += (three_nt + 1);
                dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
                if(0 == (nt & 7))
                {
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
                        pu1_dst_const_three_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
                        pu1_dst_const_three_nt_1 += 4;
                    }
                }
            }
        }
        if(nt == 16)
        {
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);
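            /* For nt == 16 only two bits of each 4-bit field are meaningful
             * (one per 8 pels), so the shifts above pack the five fields into
             * contiguous 2-bit fields: BL | L | T | TR in bits 0..7, with the
             * top-left bit landing at bit 8. */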

            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0xF) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* Obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The Top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);

                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }
        }

        if(nt == 32)
        {
            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0xFF)) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right; 8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* Obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The Top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for ref_filtering
 *
 *
 * @par Description:
 *    Reference DC filtering for neighboring samples dependent on TU size and
 *    mode. Refer to section 8.4.4.2.3 of the standard.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] nt
 *  WORD32 transform block size
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] mode
 *  WORD32 intra prediction mode
 *
 * @param[in] strong_intra_smoothing_enable_flag
 *  WORD32 flag enabling strong intra smoothing for 32x32 blocks
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
                                             WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i = 0;
    WORD32 four_nt = 4 * nt;

    WORD32 src_4nt;
    WORD32 src_0nt;
    /* Pointers are named after the functionality they serve; e.g. pu1_src_tmp_1 denotes pu1_src + 1.   */
    /* src_val_1 loads from pu1_src_tmp_1, and add_res holds the result of adding two values.           */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;

    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);

    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
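    /* For power-of-two nt, CTZ(nt) == log2(nt), so bit (log2(nt) - 2) of the
     * per-mode table entry (bit 0 for nt = 4 up to bit 3 for nt = 32) selects
     * whether reference filtering applies for this mode at this TU size. */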

    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }
    else
    {
        /* If strong intra smoothing is enabled and the transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                            - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                            - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }
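        /* The strong (bi-linear) filtering below follows section 8.4.4.2.3:
         * with two_nt = 64 for nt = 32, each filtered reference is effectively
         * ref'[i] = ((two_nt - i) * ref[0] + i * ref[two_nt] + 32) >> 6 along
         * the left edge, and likewise from ref[two_nt] to ref[4 * nt] along
         * the top. dc_val here is the threshold 1 << (BIT_DEPTH - 5) = 8. */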

        src_4nt = pu1_src[4 * nt];
        src_0nt = pu1_src[0];
        /* Strong filtering of reference samples */
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;

            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];

            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);

            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);

            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;

            const_col_i = gau1_ihevc_planar_factor + 1;

            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;

                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);

                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);

                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);

                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;

            dup_const_2 = vdup_n_u8(2);

            /* Extremities Untouched */
            pu1_dst[0] = pu1_src[0];

            /* To avoid issues when dest and src share the same pointer, the first load is done
             * outside the loop and each subsequent load is done before the store of the previous result */

            /* Perform [1 2 1] filtering of the reference samples */
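            /* Each filtered sample below is pu1_dst[i] =
             * (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2,
             * computed 8 pels at a time via vaddl/vmlal and a rounding narrow */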
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;

                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;

                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;

                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }

                add_res = vaddl_u8(src_val_0, src_val_2);

                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);
            }
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        pu1_dst[4 * nt] = src_4nt;
        pu1_dst[0] = src_0nt;
    }
}



/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma planar
 *
 * @par Description:
 *    Planar Intraprediction with reference neighboring samples location
 *    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intra prediction mode (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* Variables are named after what they hold: (nt - 1 - col) --> const_nt_1_col ('const' denotes     */
    /* gau1_ihevc_planar_factor); pu1_ref[nt - 1] --> pu1_ref_nt_1, duplicated into a d register as     */
    /* pu1_ref_nt_1_dup. log2nt + 1 is folded in when the values are assigned.                          */
    /* In the width-multiple-of-4 case the row loop is also unrolled by 2 and the stores handle this.   */
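    /* Per section 8.4.4.2.4, each output pel below is
     *   pred[col][row] = ((nt - 1 - col) * left[row] + (col + 1) * top_right
     *                   + (nt - 1 - row) * top[col] + (row + 1) * bottom_left
     *                   + nt) >> (log2(nt) + 1)
     * with left[row] = pu1_ref[2*nt - 1 - row], top[col] = pu1_ref[2*nt + 1 + col],
     * top_right = pu1_ref[3*nt + 1] and bottom_left = pu1_ref[nt - 1]. */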

    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
    /* loops have been unrolled since the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row++)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 8)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* loops have been unrolled since the width is a multiple of 4 */
    /* If the column count is a multiple of 4 then the height must be a multiple of 2 */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;

        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row += 2)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;

            const_col_1 = gau1_ihevc_planar_factor + 1;

            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 4)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));

                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);

                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);

                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);

                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }

}
/* INTRA_PRED_LUMA_PLANAR */

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma dc
 *
 * @par Description:
 *    Intraprediction for DC mode with reference neighboring samples location
 *    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intra prediction mode (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;

    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt + 1 is folded in when the value is assigned */
    log2nt_plus1 = 32 - CLZ(nt);

    /* loops have been unrolled since the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        uint64x1_t acc_dc = vdup_n_u64(0);

        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;

        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);

            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;

            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);

            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }
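        /* acc_dc now holds the sum of the nt left and nt top neighbours, so
         * dc_val = (acc_dc + nt) >> (log2(nt) + 1) per section 8.4.4.2.5 */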

        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val += 2;

        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1;
        pu1_dst_tmp = pu1_dst;

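        /* For nt == 32 the block is simply flooded with dc_val; for smaller
         * nt the first row and column are additionally filtered as
         *   pred[0][0] = (left[0] + 2 * dc_val + top[0] + 2) >> 2
         *   pred[x][0] = (top[x] + 3 * dc_val + 2) >> 2
         *   pred[0][y] = (left[y] + 3 * dc_val + 2) >> 2 */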
        if(nt == 32)
        {
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }

            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;

            /* All rows except the first are handled here                                      */
            /* Both the column and row loops are unrolled by 8                                 */
            /* The stores account for this unrolling                                          */
            /* Except for the 1st column of the remaining rows, the values are constant, so    */
            /* they are stored as a duplicated constant                                        */
            /* If the column count exceeds 8, the remaining values are also constant, which is */
            /* handled in the inner for loop                                                   */
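            /* sto_res_tmp below holds the filtered first-column values for 8
             * consecutive rows (loaded in reverse order); each vshl_n_u64 /
             * vext_u8 pair moves the next row's filtered value into lane 0
             * while lanes 1..7 are taken from dc_val_t before the row store. */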

            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);

                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;

                    /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* loops have been unrolled since the width is a multiple of 4 */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;

        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val = three_dc_val + 2;
        dc_val_t = vdup_n_u8(dc_val);

        if(nt == 32)
        {
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }

            pu1_dst_tmp = pu1_dst + dst_strd;
            /* Since the first row is already updated, the loop count is nt - 1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }

            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */

/**
 *******************************************************************************
 *
 * @brief
 *   Intra prediction interpolation filter for luma horizontal mode.
 *
 * @par Description:
 *   Horizontal intraprediction with reference neighboring samples location
 *   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intra prediction mode (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else

    /* The row loop is unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables                      */
    /* Variables are named after the operation (instruction) they perform                               */
    /* (e.g. shift_val contains the shifted value,                                                      */
    /* add_sat the add-and-saturate result)                                                             */
    /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8                      */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4                             */
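    /* In horizontal mode each row replicates its left neighbour; only the
     * first row is gradient-filtered (for nt < 32) as
     *   pred[x][0] = CLIP_U8(left[0] + ((top[x] - top_left) >> 1)) */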
    {
        if(0 != (nt & 7))      /* cond for multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;

            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;

            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);

            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_4_two_nt_plus1 += (two_nt + 1);

            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);

                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;

                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }
        }

        /* dup_1 - dup_8 hold the duplicated values from the loaded source                              */
        /* Variables are named after the operation (instruction) they perform                           */
        /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8                  */
        /* Rows and columns are unrolled by 8 when the width is a multiple of 8                         */

        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;

            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;

            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);

            for(col = nt; col > 0; col -= 8)
            {
                src_tmp = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;

                sub_res = vsubl_u8(src_tmp, dup_sub);
                shift_res  = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
                add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
                round_val = vqmovun_s16(add_res);
                vst1_u8(pu1_dst_tmp_1, round_val);
                pu1_dst_tmp_1 += 8;
            }

            for(row = nt; row > 0; row -= 8)
            {
                pu1_ref_tmp_2 -= 8;

                src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
                rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */

                dup_1 = vdup_lane_u8(rev_res, 0);
                dup_2 = vdup_lane_u8(rev_res, 1);
                dup_3 = vdup_lane_u8(rev_res, 2);
                dup_4 = vdup_lane_u8(rev_res, 3);
                dup_5 = vdup_lane_u8(rev_res, 4);
                dup_6 = vdup_lane_u8(rev_res, 5);
                dup_7 = vdup_lane_u8(rev_res, 6);
                dup_8 = vdup_lane_u8(rev_res, 7);

                for(col = nt; col > 0; col -= 8)
                {
                    pu1_dst_tmp_2 = pu1_dst_tmp_3;

                    vst1_u8(pu1_dst_tmp_2, dup_1);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_3);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_4);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_5);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_6);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_7);
                    pu1_dst_tmp_2 += dst_strd;

                    /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp_2, dup_8);
                    pu1_dst_tmp_2 += dst_strd;

                    pu1_dst_tmp_3 += 8;
                }
                pu1_dst_tmp_2 -= (nt - 8);
                pu1_dst_tmp_3 = pu1_dst_tmp_2;
            }
        }
    }
}
/* INTRA_PRED_LUMA_HORZ */

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma vertical mode.
 *
 * @par Description:
 *    Vertical intraprediction with reference neighboring samples location
 *    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intra prediction mode (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
    uint8x8_t pu1_val_two_nt_1_col;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            for(col = nt; col > 0; col -= 8)
            {
                pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
                pu1_dst_tmp += 8;
            }
            pu1_ref_tmp_1 -= nt;
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else
    {
        /* Variables are named after the operation (instruction) whose result they hold                     */
        /* (e.g. shift_val contains the shifted value,                                                      */
        /* add_sat holds the add-and-saturate result)                                                       */
        /* Loops are unrolled by 4 and 8, since the input width is always a multiple of 4 or 8              */
        /* Rows and columns are unrolled by 4 when the width is a multiple of 4                             */

        if(0 != (nt & 7))
        {
            WORD32 cond_4 = 0;
            UWORD8 *pu1_ref_val1 = pu1_ref;
            UWORD8 *pu1_ref_val2 = pu1_ref;
            UWORD8 *pu1_ref_val3 = pu1_ref;

            UWORD8 *pu1_dst_val1 = pu1_dst;
            UWORD8 *pu1_dst_val2 = pu1_dst;
            UWORD8 *pu1_dst_val3 = pu1_dst;

            uint8x8_t dup_2_sub, round_val, vext_val;
            uint16x8_t dup_2_add;
            uint32x2_t src_val1, src_val2, src_val3;
            uint16x8_t sub_val;
            int16x8_t shift_val1, add_sat;
            uint64x1_t shift_val2;

            src_val1 = vdup_n_u32(0);
            src_val2 = vdup_n_u32(0);
            src_val3 = vdup_n_u32(0);
            pu1_ref_val1 += (two_nt - nt);
            pu1_ref_val3 += (two_nt + 2);
            pu1_ref_val2 += (two_nt + 1);

            dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            /* loops to store the first nt sets of values in the destination */

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
                {
                    /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here */
                    src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
                    shift_val1  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
                    round_val = vqmovun_s16(add_sat);

                    /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here */
                    src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
                    vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    pu1_ref_val1  -= 4;
                }

                /* loop to store next sets of eight values in the destination */

                for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
                {
                    src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
                }
                pu1_ref_val2 += 4;
                pu1_dst_val3 += 4;
                pu1_dst_val2 = pu1_dst_val3;
                cond_4 = 1;
            }
        }

        /* Rows and columns are unrolled by 8 when the width is a multiple of 8          */
        else
        {
            WORD32 cond = 0, col_1;
            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst;

            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
            UWORD8 *pu1_ref_tmp_3 = pu1_ref;

            uint8x8_t pu1_src_tmp1;
            uint8x8_t pu1_src_tmp2;

            uint8x8_t dup_sub;
            uint16x8_t dup_add;
            int16x8_t subsh_val;
            int16x8_t addsat_val;
            uint16x8_t sub_val;
            uint8x8_t round_val;
            uint8x8_t vext_t;
            uint64x1_t shift_64;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            pu1_ref_tmp_1 += (two_nt);
            pu1_ref_tmp_1 -= 8;
            pu1_ref_tmp_2 += (two_nt + 2);
            pu1_ref_tmp_3 += (two_nt + 1);

            /* loops to store the first nt sets of values in the destination */

            for(row = nt; row > 0; row -= 8)
            {
                for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
                {
                    pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);

                    sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
                    subsh_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(addsat_val);

                    /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here */

                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
                    vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    pu1_ref_tmp_1 -= 8;
                }

                /* loop to store next sets of eight values in the destination */

                for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
                {
                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                }
                pu1_ref_tmp_3 += 8;
                pu1_dst_tmp_3 += 8;
                pu1_dst_tmp_2 = pu1_dst_tmp_3;
                cond = 1;
            }
        }
    }
}
/* INTRA_PRED_LUMA_VER */

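/* A minimal scalar sketch (hypothetical helper, not part of the decoder) of
 * what ihevc_intra_pred_luma_ver_neonintr computes: in the non-32 paths the
 * first column is pu1_ref[two_nt + 1] corrected by half the difference
 * between the co-located left sample and the top-left corner, saturated to
 * 8 bits (the vqaddq_s16/vqmovun_s16 pair above); all remaining columns are
 * a plain copy of the row above the block, which is also all the nt == 32
 * path does. */
static void ihevc_intra_pred_luma_ver_scalar_sketch(UWORD8 *pu1_ref,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 dst_strd,
                                                    WORD32 nt)
{
    WORD32 row, col;
    WORD32 two_nt = 2 * nt;

    for(row = 0; row < nt; row++)
    {
        WORD32 s2_predpixel = pu1_ref[two_nt + 1]
                        + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);

        /* Gradient-filtered first column, clipped to the 8-bit range */
        pu1_dst[row * dst_strd] =
            (UWORD8)(s2_predpixel < 0 ? 0 : (s2_predpixel > 255 ? 255 : s2_predpixel));

        /* Unfiltered copy of the top reference row for the other columns */
        for(col = 1; col < nt; col++)
            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
    }
}
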
/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma mode2.
*
* @par Description:
*    Intraprediction for mode 2 (south-west angle) with reference neighboring
*    samples location pointed by 'pu1_ref' to the TU block location pointed
*    by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference array
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  See the illustrative scalar sketch after this function.
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{

    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    /* rev_res holds the reversed result value                                                          */
    /* Loops are unrolled by 4 and 8, since the input width is always a multiple of 4 or 8              */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4                             */

    if(0 != (nt & 7))
    {
        UWORD8 *pu1_ref_tmp = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        uint8x8_t pu1_src_val, rev_res;
        uint64x1_t shift_res;

        for(col = nt; col > 0; col -= 4)
        {
            for(row = nt; row > 0; row -= 4)
            {
                /* unrolling all cols & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */

                pu1_src_val = vld1_u8(pu1_ref_tmp);
                shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
                rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;
            }
        }
    }

    /* rev_val_first and rev_val_second reverse the loaded values so that they end up in the right order  */
    /* shift_val shifts the reversed second value to obtain the bytes that are needed                     */
    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                               */

    else
    {
        UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;

        uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
        uint64x1_t shift_val;

        two_nt = 2 * nt;
        pu1_ref_two_nt_minus2 += (two_nt);
        pu1_ref_two_nt_minus2 -= 8;

        for(col = nt; col > 0; col -= 8)
        {
            for(row = nt; row > 0; row -= 8)
            {
                pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_first = vrev64_u8(pu1_src_val2);

                pu1_ref_two_nt_minus2 -= 8;
                pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_second = vrev64_u8(pu1_src_val1);

                vext_t = vext_u8(rev_val_first, rev_val_second, 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
            }
            pu1_dst_tmp_plus8 += 8;
            pu1_dst_tmp = pu1_dst_tmp_plus8;
            pu1_ref_two_nt_minus2 += (nt - 8);
        }
    }
}
/* INTRA_PRED_LUMA_MODE2 */

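/* A minimal scalar sketch of mode-2 prediction (hypothetical helper, not part
 * of the decoder), unrolled from the comment inside the function above: with
 * the fixed mode-2 angle of 32, idx is simply row + 1, so each destination
 * row copies the left reference column top-to-bottom, starting one sample
 * lower than the previous row. */
static void ihevc_intra_pred_luma_mode2_scalar_sketch(UWORD8 *pu1_ref,
                                                      UWORD8 *pu1_dst,
                                                      WORD32 dst_strd,
                                                      WORD32 nt)
{
    WORD32 row, col, idx;
    WORD32 two_nt = 2 * nt;

    for(row = 0; row < nt; row++)
    {
        idx = row + 1; /* ((row + 1) * 32) >> 5 */
        for(col = 0; col < nt; col++)
            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - col - idx - 1];
    }
}
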
/**
*******************************************************************************
*
* @brief
*   Intra prediction interpolation filter for luma mode 18 & mode 34.
*
* @par Description:
*    Intraprediction for mode 18 (north-west angle) and mode 34 (north-east
*    angle) with reference neighboring samples location pointed by 'pu1_ref'
*    to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference array
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  See the illustrative scalar sketch after this function.
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{

    WORD32 row, col, idx;
    WORD32 intraPredAngle = 32;
    WORD32 two_nt;
    UNUSED(src_strd);
    two_nt = 2 * nt;

    UWORD8 *pu1_ref_tmp = pu1_ref;
    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;

    uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;

    /* src_tmp_1st and src_tmp_2nd hold the first eight and the next eight values loaded from the source (pu1_ref) */
    /* vext1 - vext7 hold the vext results of the two loaded values, interleaved with the stores to help dual issue */
    /* Loops are unrolled by 4 and 8, since the input width is always a multiple of 4 or 8                          */
    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                                         */
    /* Separate loops are maintained for mode 18 and mode 34                                                        */

    /* condition to allow multiples of 8 */
    if(0 == (nt & 7))
    {
        if(mode == 34)
        {
            pu1_ref_tmp += (two_nt + 2);

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp += 8;

                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);

                    /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;

                    vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;

                    vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;

                    vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;

                    vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;

                    vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;

                    vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;

                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }

                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp -= (nt - 8);
            }
        }
        else /* Loop for mode 18 */
        {
            pu1_ref_tmp += (two_nt);

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp -= 8;

                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);

                    /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;

                    vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;

                    vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;

                    vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;

                    vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;

                    vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;

                    vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;

                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }
                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp += (nt + 8);
            }
        }
    }

    /* Rows and columns are unrolled by 4 when the width is a multiple of 4  */

    else /* loop for multiples of 4 */
    {
        uint8x8_t src_val1;
        uint8x8_t src_val2;

        if(mode == 18)
            intraPredAngle = -32;
        else if(mode == 34)
            intraPredAngle = 32;

        for(row = 0; row < nt; row += 2)
        {
            /* unrolling 2 rows */
            idx = ((row + 1) * intraPredAngle) >> 5;
            pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
            src_val1 = vld1_u8(pu1_ref_tmp);

            idx = ((row + 2) * intraPredAngle) >> 5;
            pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
            src_val2 = vld1_u8(pu1_ref_tmp1);

            /* unrolling 4 cols */
            for(col = nt; col > 0; col -= 4)
            {
                pu1_dst_tmp = pu1_dst;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
                pu1_dst_tmp += dst_strd;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_MODE_18_34 */

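/* A minimal scalar sketch of both diagonal modes (hypothetical helper, not
 * part of the decoder), unrolled from the UNROLLED comment inside the
 * function above: with the pure diagonal angles +-32, idx is simply
 * +-(row + 1), so each destination row is the top reference row shifted by
 * one more sample to the right (mode 34) or to the left (mode 18). */
static void ihevc_intra_pred_luma_mode_18_34_scalar_sketch(UWORD8 *pu1_ref,
                                                           UWORD8 *pu1_dst,
                                                           WORD32 dst_strd,
                                                           WORD32 nt,
                                                           WORD32 mode)
{
    WORD32 row, col, idx;
    WORD32 two_nt = 2 * nt;
    WORD32 intraPredAngle = (mode == 34) ? 32 : -32;

    for(row = 0; row < nt; row++)
    {
        idx = ((row + 1) * intraPredAngle) >> 5; /* +-(row + 1) */
        for(col = 0; col < nt; col++)
            pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
    }
}
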
/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 3 to mode 9
 *
 * @par Description:
 *    Intraprediction for modes 3 to 9 (positive angle, horizontal modes) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference array
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  See the illustrative scalar sketch after this function.
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
                                                WORD32 src_strd,
                                                UWORD8 *pu1_dst,
                                                WORD32 dst_strd,
                                                WORD32 nt,
                                                WORD32 mode)
{

    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 100, fract_prev;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {

        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        WORD32 two_nt = 2 * nt;

        pu1_ref_main_idx += two_nt;
        pu1_ref_main_idx_1 += two_nt - 1;

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        pu1_ref_main_idx -= 8;
        pu1_ref_main_idx_1 -= 8;

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;

            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                pu1_ref_main_idx -= 8;
                pu1_ref_main_idx_1 -= 8;
            }
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

            pu1_ref_main_idx += nt;
            pu1_ref_main_idx_1 += nt;

            pu1_ref_main_idx -= 1;
            pu1_ref_main_idx_1 -= 1;
        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        pu1_ref_tmp1 += nt;
        pu1_ref_tmp2 += (nt - 1);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
            }
            pu1_ref_tmp1 -= 1;
            pu1_ref_tmp2 -= 1;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}

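/* A minimal scalar sketch of the per-pixel arithmetic in the positive-angle
 * horizontal modes (hypothetical helper, not part of the decoder), assuming
 * the usual reference layout where pu1_ref[two_nt] is the top-left corner and
 * lower indices walk down the left column: pos accumulates the angle in 1/32
 * sample units, idx is its integer part and fract the 5-bit phase used to
 * blend two neighbouring reference samples with round-to-nearest. This is
 * the scalar twin of the vmull_u8 / vaddq_u16 / vrshrn_n_u16(.., 5) sequence
 * in the function above. */
static void ihevc_intra_pred_luma_mode_3_to_9_scalar_sketch(UWORD8 *pu1_ref,
                                                            UWORD8 *pu1_dst,
                                                            WORD32 dst_strd,
                                                            WORD32 nt,
                                                            WORD32 mode)
{
    WORD32 row, col, pos, idx, fract;
    WORD32 two_nt = 2 * nt;
    WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];

    for(col = 0; col < nt; col++)
    {
        pos = (col + 1) * intra_pred_ang;
        idx = pos >> 5;   /* integer step along the left reference column */
        fract = pos & 31; /* 5-bit sub-sample phase */

        for(row = 0; row < nt; row++)
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(((32 - fract) * pu1_ref[two_nt - row - idx - 1]
                        + fract * pu1_ref[two_nt - row - idx - 2] + 16) >> 5);
    }
}
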
/**
 *******************************************************************************
 *
 * @brief
 *   Intra prediction interpolation filter for luma mode 11 to mode 17
 *
 * @par Description:
 *    Intraprediction for modes 11 to 17 (negative angle, horizontal modes)
 *    with reference neighboring samples location pointed by 'pu1_ref' to the
 *    TU block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference array
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  See the illustrative sketch of the main-reference construction after
 *  this function.
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    WORD32 pos, fract = 1000, fract_prev;
    WORD32 ref_idx;

    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;
    UWORD8 *pu1_dst_tmp2 = pu1_dst;

    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_left_t;
    uint32x2_t ref_left_tmp;
    UNUSED(src_strd);
    ref_left_tmp = vdup_n_u32(0);

    inv_ang_sum = 128;
    two_nt = 2 * nt;

    intra_pred_ang = gai4_ihevc_ang_table[mode];

    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];

    pu1_ref_tmp1 += two_nt;

    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);

        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_t = vld1_u8(pu1_ref_tmp2);

            ref_left_t = vrev64_u8(ref_left_t);
            vst1_u8(ref_main_tmp, ref_left_t);
            ref_main_tmp += 8;
            pu1_ref_tmp2 -= 8;
        }
    }
    else
    {
        uint8x8_t rev_val;
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);

            rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
            vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
        }
    }

    ref_main[nt] = pu1_ref[two_nt - nt];

    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */

    ref_idx = (nt * intra_pred_ang) >> 5;

    /* SIMD optimization can be done using a look-up table for the loop */
    /* For negative angles, derive the main reference samples from the side */
    /* reference samples; refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between 2 neighboring */
        /* samples, weighted by distance, to obtain the destination sample */
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;
            }

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}

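/* A sketch of the main-reference construction done at the top of the function
 * above (hypothetical helper, not part of the decoder; the function performs
 * the same work with vrev64_u8 plus a scalar projection loop). Positive
 * indices hold the left column reversed so it reads top-to-bottom starting at
 * the top-left corner; indices below zero are projected from the above row
 * using the inverse angle in 8.8 fixed point, where the initial 128 in
 * inv_ang_sum provides round-to-nearest (section 8.4.4.2.6). */
static void ihevc_intra_pred_build_ref_main_sketch(UWORD8 *pu1_ref,
                                                   UWORD8 *ref_temp,
                                                   WORD32 nt,
                                                   WORD32 mode)
{
    WORD32 k, ref_idx;
    WORD32 inv_ang_sum = 128;
    WORD32 two_nt = 2 * nt;
    WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];
    WORD32 inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
    UWORD8 *ref_main = ref_temp + (nt - 1);

    /* Left column reversed: ref_main[0] is the top-left corner sample */
    for(k = 0; k <= nt; k++)
        ref_main[k] = pu1_ref[two_nt - k];

    /* Extension at negative indices, projected from the above row */
    ref_idx = (nt * intra_pred_ang) >> 5;
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }
}
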
/**
 *******************************************************************************
 *
 * @brief
 *   Intra prediction interpolation filter for luma mode 19 to mode 25
 *
 * @par Description:
 *    Intraprediction for modes 19 to 25 (negative angle, vertical modes) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference array
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt, intra_pred_ang;
    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
    WORD32 ref_idx;
    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_above_t;
    uint32x2_t ref_above_tmp;
    UNUSED(src_strd);
    ref_above_tmp = vdup_n_u32(0);

    two_nt = 2 * nt;
    intra_pred_ang = gai4_ihevc_ang_table[mode];
    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];

    /* Intermediate reference samples for negative angle modes */
    /* This has to be removed during optimization */
    pu1_ref_tmp1 += two_nt;

    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);
        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_above_t = vld1_u8(pu1_ref_tmp1);
            vst1_u8(ref_main_tmp, ref_above_t);
            ref_main_tmp += 8;
            pu1_ref_tmp1 += 8;
        }
    }
    else
    {
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 4)
        {
            ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
            vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
        }
    }

    ref_main[nt] = pu1_ref[two_nt + nt];

    /* For vertical modes, (ref main = ref above) (ref side = ref left) */

    ref_idx = (nt * intra_pred_ang) >> 5;
    inv_ang_sum = 128;

    /* SIMD optimization can be done using a look-up table for the loop */
    /* For negative angles, derive the main reference samples from the side */
    /* reference samples; refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between 2 neighboring */
        /* samples, weighted by distance, to obtain the destination sample */
        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;
            }
            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 27 to mode 33
 *
 * @par Description:
 *    Intraprediction for modes 27 to 33 (positive angle, vertical modes) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference array
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  See the illustrative scalar sketch after this function.
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 0, fract_prev;

    WORD32 two_nt = 2 * nt;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {

        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        pu1_ref_main_idx += (two_nt + 1);
        pu1_ref_main_idx_1 += (two_nt + 2);

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;

            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev > fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                pu1_ref_main_idx += 8;
                pu1_ref_main_idx_1 += 8;
            }

            pu1_ref_main_idx -= nt;
            pu1_ref_main_idx_1 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;

        pu1_ref_tmp1 += (two_nt + 1);
        pu1_ref_tmp2 += (two_nt + 2);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev > fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;
            }

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
}

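/* A minimal scalar sketch of the positive-angle vertical modes (hypothetical
 * helper, not part of the decoder). It recomputes idx = pos >> 5 for every
 * row; the function above instead advances its reference pointers by one
 * sample whenever the 5-bit phase wraps (fract_prev > fract), which is
 * equivalent because intra_pred_ang < 32 for modes 27 to 33, so idx grows by
 * at most 1 per row. */
static void ihevc_intra_pred_luma_mode_27_to_33_scalar_sketch(UWORD8 *pu1_ref,
                                                              UWORD8 *pu1_dst,
                                                              WORD32 dst_strd,
                                                              WORD32 nt,
                                                              WORD32 mode)
{
    WORD32 row, col, pos, idx, fract;
    WORD32 two_nt = 2 * nt;
    WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];

    for(row = 0; row < nt; row++)
    {
        pos = (row + 1) * intra_pred_ang;
        idx = pos >> 5;   /* what the fract_prev > fract test tracks incrementally */
        fract = pos & 31; /* 5-bit sub-sample phase */

        for(col = 0; col < nt; col++)
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(((32 - fract) * pu1_ref[two_nt + col + idx + 1]
                        + fract * pu1_ref[two_nt + col + idx + 2] + 16) >> 5);
    }
}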