/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_intra_pred_filters_neon_intr.c
*
* @brief
*  Contains function definitions for the intra prediction interpolation filters
*
* @author
*  Yogeswaran RS
*
* @par List of Functions:
*  - ihevc_intra_pred_luma_planar()
*  - ihevc_intra_pred_luma_dc()
*  - ihevc_intra_pred_luma_horz()
*  - ihevc_intra_pred_luma_ver()
*  - ihevc_intra_pred_luma_mode2()
*  - ihevc_intra_pred_luma_mode_18_34()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128 /* 4 * 32: reference sample count for nt = 32 */
#define T16_4NT 64  /* 4 * 16: reference sample count for nt = 16 */


/*****************************************************************************/
/* Table Look-up                                                             */
/*****************************************************************************/

/* Extracts bit x of y as 0 or 1 (arguments fully parenthesized) */
#define GET_BITS(y, x) (((y) >> (x)) & 1)
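/* Usage sketch (illustrative values, not from the original source):
 * GET_BITS(0x14, 2) == 1 since bit 2 of 0x14 (binary 10100) is set, while
 * GET_BITS(0x14, 0) == 0 since bit 0 is clear.
 */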

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for pu1_ref substitution
*
* @par Description:
*    Reference substitution process for samples unavailable for prediction.
*    Refer to section 8.4.4.2.2
*
* @param[in] pu1_top_left
*  UWORD8 pointer to the top-left
*
* @param[in] pu1_top
*  UWORD8 pointer to the top
*
* @param[in] pu1_left
*  UWORD8 pointer to the left
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] nt
*  WORD32 Transform block size
*
* @param[in] nbr_flags
*  WORD32 neighbor availability flags
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 Destination stride
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
                                                     UWORD8 *pu1_top,
                                                     UWORD8 *pu1_left,
                                                     WORD32 src_strd,
                                                     WORD32 nt,
                                                     WORD32 nbr_flags,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;
    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);
    dc_val = 1 << (BIT_DEPTH - 1);

    /* Neighbor flag structure (bit widths per field):      */
    /*    Top-Left | Top-Right | Top | Left | Bottom-Left   */
    /*        1         4         4     4         4         */
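    /* Worked example (illustrative, assuming the layout above):
     * nbr_flags == 0x10110 means top-left (bit 16), top (bit 8) and left
     * (bit 4) are available, while top-right (bit 12) and bottom-left
     * (bit 0) are unavailable and must be substituted.
     */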

    /* If no neighbor flags are set, fill all the reference samples with the DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples */
        pu1_dst[two_nt] = *pu1_top_left;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        UWORD8 *pu1_top_tmp = pu1_top;
        pu1_dst_tmp2 += two_nt + 1;

        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];

        uint8x8_t src;
        for(i = two_nt; i > 0; i -= 8)
        {
            src = vld1_u8(pu1_top_tmp);
            pu1_top_tmp += 8;
            vst1_u8(pu1_dst_tmp2, src);
            pu1_dst_tmp2 += 8;
        }

        if(nt <= 8)
        {
            /* 1-bit extraction for each of the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = nbr_flags & 0x1;
            left = (nbr_flags & 0x10) >> 4;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, apply the reverse substitution process */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from bottom-left */
                while(!a_nbr_flag[next])
                    next++;

                /* If left or top-left is available */
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top or top-right is available */
                {
                    /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward substitution process */
            /* If left is unavailable, copy the last bottom-left value */

            if(left == 0)
            {
                uint8x8_t dup_pu1_dst1;
                UWORD8 *pu1_dst_const_nt = pu1_dst;
                pu1_dst_const_nt += nt;

                if(0 == (nt & 7))
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
                        pu1_dst_const_nt += 8;
                    }
                }
                else
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
                        pu1_dst_const_nt += 4;
                    }
                }
            }
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            if(top == 0)
            {
                if(0 == (nt & 7))
                {
                    uint8x8_t dup_pu1_dst2;
                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
                    pu1_dst_const_two_nt_1 += (two_nt + 1);
                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
                        pu1_dst_const_two_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = 0; i < nt; i++)
                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                }
            }
            if(tp_right == 0)
            {
                uint8x8_t dup_pu1_dst3;
                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
                pu1_dst_const_three_nt_1 += (three_nt + 1);
                dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
                if(0 == (nt & 7))
                {
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
                        pu1_dst_const_three_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
                        pu1_dst_const_three_nt_1 += 4;
                    }
                }
            }
        }
        if(nt == 16)
        {
            WORD32 nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);
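
            /* Worked example (illustrative): with every neighbor available for
             * nt == 16, nbr_flags == 0x13333 packs down to
             *   0x3 + (0x30 >> 2) + (0x300 >> 4) + (0x3000 >> 6) + (0x10000 >> 8)
             *   = 0x3 + 0xC + 0x30 + 0xC0 + 0x100 = 0x1FF,
             * i.e. 2 bits each for BL/L/T/TR in bits 0..7 plus TL in bit 8. */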

            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below-left and left */

                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top-left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top-right; 8 pels per nbr bit */
                    }
                }
                /* Reverse substitution process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* Loop over the 4*nt + 1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* Obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);

                    /* only 1 pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }
        }

        if(nt == 32)
        {
            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below-left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top-left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top-left not available */
                        nbr_id_from_bl++;
                        /* top and top-right; 8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse substitution process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Loop over the 4*nt + 1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* Obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only 1 pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for reference filtering
*
* @par Description:
*    Reference sample smoothing for the neighboring samples, applied
*    depending on the TU size and mode. Refer to section 8.4.4.2.3 in the
*    standard
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] nt
*  WORD32 Transform block size
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] mode
*  WORD32 intra prediction mode
*
* @param[in] strong_intra_smoothing_enable_flag
*  WORD32 flag enabling strong intra smoothing for 32x32 blocks
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
                                             WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i = 0;
    WORD32 four_nt = 4 * nt;

    WORD32 src_4nt;

    /* Names follow the functionality, e.g. pu1_src_tmp_1 denotes pu1_src + 1,        */
    /* src_val_1 loads from pointer pu1_src_tmp_1, add_res holds the result of an add */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;

    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);

    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));

    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }

    else
    {
        /* If strong intra smoothing is enabled and the transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong intra filtering decision */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                            - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                            - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }
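
        /* Note (editorial): at BIT_DEPTH == 8 the threshold dc_val is
         * 1 << (8 - 5) == 8, so strong (bilinear) smoothing is chosen only
         * when both reference arms are close to linear ramps, e.g.
         * |pu1_src[2*nt] + pu1_src[4*nt] - 2*pu1_src[3*nt]| < 8 on top. */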

        src_4nt = pu1_src[4 * nt];
        /* Strong filtering of reference samples */
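        /* Sketch of the scalar equivalent (editorial note): with nt == 32 and
         * two_nt == 64, for each column offset i = 1..two_nt the loop below
         * computes
         *   pu1_dst[i]        = ((64 - i) * pu1_src[0]    + i * pu1_src[2*nt] + 32) >> 6
         *   pu1_dst[2*nt + i] = ((64 - i) * pu1_src[2*nt] + i * pu1_src[4*nt] + 32) >> 6
         * where the rounding (+32) comes from vrshrn_n_u16(..., 6); the
         * midpoint and endpoint are then overwritten with the exact
         * reference values. */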
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;

            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];

            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);

            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);

            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;

            const_col_i = gau1_ihevc_planar_factor + 1;

            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;

                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);

                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);

                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);

                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;

            dup_const_2 = vdup_n_u8(2);

            /* Extremities untouched */
            pu1_dst[0] = pu1_src[0];

            /* To support in-place filtering (pu1_dst == pu1_src), each iteration's
             * loads are issued before the previous iteration's result is stored */

            /* Perform [1 2 1] smoothing of the reference samples */
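            /* Scalar model (editorial note): each output sample is the rounded
             * three-tap smoothing
             *   pu1_dst[i] = (pu1_src[i-1] + 2 * pu1_src[i] + pu1_src[i+1] + 2) >> 2,
             * built from vaddl_u8 + vmlal_u8 and a rounding narrowing shift
             * (vrshrn_n_u16 by 2). */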
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;

                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;

                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;

                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }

                add_res = vaddl_u8(src_val_0, src_val_2);

                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);
            }
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        pu1_dst[4 * nt] = src_4nt;
    }
}


/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma planar
*
* @par Description:
*    Planar intra prediction with reference neighboring samples located at
*    'pu1_ref', writing to the TU block location pointed to by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  WORD32 source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 destination stride
*
* @param[in] nt
*  WORD32 Transform block size
*
* @param[in] mode
*  WORD32 intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* Variables are named after the quantity they hold, e.g. (nt - 1 - col) --> const_nt_1_col */
    /* (const denotes gau1_ihevc_planar_factor); pu1_ref[nt - 1] --> pu1_ref_nt_1, and its      */
    /* duplicate in a d register is pu1_ref_nt_1_dup                                            */
    /* The shift by (log2nt + 1) is folded into the value assignments themselves                */
    /* In the width-multiple-of-4 case the row loop is also unrolled by 2, with stores adjusted */
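
    /* For reference (editorial note), the scalar planar equation this routine
     * vectorizes is
     *   pred(x, y) = ((nt-1-x) * pu1_ref[two_nt-1-y] + (x+1) * pu1_ref[three_nt+1]
     *              +  (nt-1-y) * pu1_ref[two_nt+1+x] + (y+1) * pu1_ref[nt-1]
     *              +  nt) >> log2nt_plus1 */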

    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
    /* Loops have been unrolled assuming the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row++)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 8)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* Loops have been unrolled assuming the width is a multiple of 4 */
    /* If the column count is a multiple of 4, the height is a multiple of 2 */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;

        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row += 2)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;

            const_col_1 = gau1_ihevc_planar_factor + 1;

            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 4)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));

                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);

                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);

                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);

                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_PLANAR */

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma DC
*
* @par Description:
*    Intra prediction for the DC mode with reference neighboring samples
*    located at 'pu1_ref', writing to the TU block location pointed to by
*    'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  WORD32 source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 destination stride
*
* @param[in] nt
*  WORD32 Transform block size
*
* @param[in] mode
*  WORD32 intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;

    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);

    /* The shift by (log2nt + 1) is folded into the value assignment itself. */
    log2nt_plus1 = 32 - CLZ(nt);
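
    /* For reference (editorial note), the scalar computation being vectorized:
     *   dc_val = (sum of nt left samples + sum of nt top samples + nt) >> (log2(nt) + 1)
     * and, for nt != 32, the first row and column are then smoothed as
     *   pu1_dst[0]   = (pu1_ref[2*nt - 1] + 2*dc_val + pu1_ref[2*nt + 1] + 2) >> 2
     *   pu1_dst[col] = (pu1_ref[2*nt + 1 + col] + 3*dc_val + 2) >> 2 */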

    /* Loops have been unrolled assuming the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        uint64x1_t acc_dc = vdup_n_u64(col); /* col is 0 here, so the accumulator starts at 0 */

        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;

        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);

            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;

            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);

            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }

        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val += 2;

        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1;
        pu1_dst_tmp = pu1_dst;


        if(nt == 32)
        {
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }

            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;

            /* All rows except the first are handled here                                      */
            /* Both the column and row loops have been unrolled by 8                           */
            /* The stores account for the unrolling                                            */
            /* Except for the 1st column of the remaining rows, the values are constant, so    */
            /* they are stored from a duplicated constant value                                */
            /* If the column count is greater than 8, the remaining values are constant, which */
            /* is handled in the inner for loop                                                */
            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);

                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For the last set of 8 rows only 7 rows need updating, since the first row was already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need updating, since the first row was already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* Loops have been unrolled assuming the width is a multiple of 4 */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;

        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val = three_dc_val + 2;
        dc_val_t = vdup_n_u8(dc_val);

        if(nt == 32)
        {
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }

            pu1_dst_tmp = pu1_dst + dst_strd;
            /* Since the first row is already written above, the loop count is nt - 1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }

            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for the luma horizontal mode
*
* @par Description:
*    Horizontal intra prediction with reference neighboring samples located
*    at 'pu1_ref', writing to the TU block location pointed to by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  WORD32 source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 destination stride
*
* @param[in] nt
*  WORD32 Transform block size
*
* @param[in] mode
*  WORD32 intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else

    /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables */
    /* Variables are named after the operation (instruction) they perform                */
    /* (e.g. shift_val holds a shifted value, add_sat an added-and-saturated value)      */
    /* Loops are unrolled by 4 and 8, assuming the input width is a multiple of 4 or 8   */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4              */
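    /* Scalar model for the block below (editorial note): the first row is
     *   pu1_dst[col] = pu1_ref[two_nt - 1]
     *                + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1),
     * clipped to 8 bits (vqmovun_s16 provides the clip), while every other
     * row simply replicates the left reference pu1_ref[two_nt - 1 - row]. */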
    {
        if(0 != (nt & 7))      /* case: width is a multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;

            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;

            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);

            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_4_two_nt_plus1 += (two_nt + 1);

            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);

                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;

                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }
        }

        /* dup_1 .. dup_8 hold the duplicated values from the loaded source                */
        /* Variables are named after the operation (instruction) they perform              */
        /* Loops are unrolled by 4 and 8, assuming the input width is a multiple of 4 or 8 */
        /* Rows and columns are unrolled by 8 when the width is a multiple of 8            */

        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;

            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;

            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);

            for(col = nt; col > 0; col -= 8)
            {
                src_tmp = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;

                sub_res = vsubl_u8(src_tmp, dup_sub);
                shift_res  = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
                add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
                round_val = vqmovun_s16(add_res);
                vst1_u8(pu1_dst_tmp_1, round_val);
                pu1_dst_tmp_1 += 8;
            }

            for(row = nt; row > 0; row -= 8)
            {
                pu1_ref_tmp_2 -= 8;

                src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
                rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */

                dup_1 = vdup_lane_u8(rev_res, 0);
                dup_2 = vdup_lane_u8(rev_res, 1);
                dup_3 = vdup_lane_u8(rev_res, 2);
                dup_4 = vdup_lane_u8(rev_res, 3);
                dup_5 = vdup_lane_u8(rev_res, 4);
                dup_6 = vdup_lane_u8(rev_res, 5);
                dup_7 = vdup_lane_u8(rev_res, 6);
                dup_8 = vdup_lane_u8(rev_res, 7);

                for(col = nt; col > 0; col -= 8)
                {
                    pu1_dst_tmp_2 = pu1_dst_tmp_3;

                    vst1_u8(pu1_dst_tmp_2, dup_1);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_3);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_4);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_5);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_6);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_7);
                    pu1_dst_tmp_2 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need updating, since the first row was already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp_2, dup_8);
                    pu1_dst_tmp_2 += dst_strd;

                    pu1_dst_tmp_3 += 8;
                }
                pu1_dst_tmp_2 -= (nt - 8);
                pu1_dst_tmp_3 = pu1_dst_tmp_2;
            }
        }
    }
}
/* INTRA_PRED_LUMA_HORZ */

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for the luma vertical mode
*
* @par Description:
*    Vertical intra prediction with reference neighboring samples located
*    at 'pu1_ref', writing to the TU block location pointed to by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  WORD32 source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 destination stride
*
* @param[in] nt
*  WORD32 Transform block size
*
* @param[in] mode
*  WORD32 intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
1423 {
1424     WORD32 row, col;
1425     WORD32 two_nt;
1426     UNUSED(src_strd);
1427     UNUSED(mode);
1428 
1429     two_nt = 2 * nt;
1430 
1431     UWORD8 *pu1_dst_tmp = pu1_dst;
1432     UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
1433     uint8x8_t pu1_val_two_nt_1_col;
1434     if(nt == 32)
1435     {
1436         pu1_dst_tmp = pu1_dst;
1437         for(row = 0; row < nt; row++)
1438         {
1439             for(col = nt; col > 0; col -= 8)
1440             {
1441                 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
1442                 pu1_ref_tmp_1 += 8;
1443                 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
1444                 pu1_dst_tmp += 8;
1445             }
1446             pu1_ref_tmp_1 -= nt;
1447             pu1_dst_tmp += dst_strd - nt;
1448         }
1449     }
1450     else
1451 
1452     {
        /* Variables are named after the operation (instruction) they perform               */
        /* (e.g. shift_val holds the shifted value,                                         */
        /* add_sat the add-and-saturate result)                                             */
        /* Loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8     */
        /* Rows and columns are unrolled by 4 when the width is a multiple of 4             */

        if(0 != (nt & 7))
        {
            WORD32 cond_4 = 0;
            UWORD8 *pu1_ref_val1 = pu1_ref;
            UWORD8 *pu1_ref_val2 = pu1_ref;
            UWORD8 *pu1_ref_val3 = pu1_ref;

            UWORD8 *pu1_dst_val1 = pu1_dst;
            UWORD8 *pu1_dst_val2 = pu1_dst;
            UWORD8 *pu1_dst_val3 = pu1_dst;

            uint8x8_t dup_2_sub, round_val, vext_val;
            uint16x8_t dup_2_add;
            uint32x2_t src_val1, src_val2, src_val3;
            uint16x8_t sub_val;
            int16x8_t shift_val1, add_sat;
            uint64x1_t shift_val2;

            src_val1 = vdup_n_u32(0);
            src_val2 = vdup_n_u32(0);
            src_val3 = vdup_n_u32(0);
            pu1_ref_val1 += (two_nt - nt);
            pu1_ref_val3 += (two_nt + 2);
            pu1_ref_val2 += (two_nt + 1);

            dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            /* loops to store the first nt sets of values in the destination */

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
                {
                    /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here */
                    src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
                    shift_val1  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
                    round_val = vqmovun_s16(add_sat);

                    /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here */
                    src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
                    vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    pu1_ref_val1  -= 4;
                }

                /* loop to store next sets of eight values in the destination */

                for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
                {
                    src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
                }
                pu1_ref_val2 += 4;
                pu1_dst_val3 += 4;
                pu1_dst_val2 = pu1_dst_val3;
                cond_4 = 1;
            }
        }

        /* Rows and columns are unrolled by 8 when the width is a multiple of 8         */
        else
        {
            WORD32 cond = 0, col_1;
            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst;

            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
            UWORD8 *pu1_ref_tmp_3 = pu1_ref;

            uint8x8_t pu1_src_tmp1;
            uint8x8_t pu1_src_tmp2;

            uint8x8_t dup_sub;
            uint16x8_t dup_add;
            int16x8_t subsh_val;
            int16x8_t addsat_val;
            uint16x8_t sub_val;
            uint8x8_t round_val;
            uint8x8_t vext_t;
            uint64x1_t shift_64;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            pu1_ref_tmp_1 += (two_nt);
            pu1_ref_tmp_1 -= 8;
            pu1_ref_tmp_2 += (two_nt + 2);
            pu1_ref_tmp_3 += (two_nt + 1);

            /* loops to store the first nt sets of values in the destination */

            for(row = nt; row > 0; row -= 8)
            {
                for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
                {
                    pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);

                    sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
                    subsh_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(addsat_val);

                    /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here */

                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
                    vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    pu1_ref_tmp_1 -= 8;
                }

                /* loop to store next sets of eight values in the destination */

                for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
                {
                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                }
                pu1_ref_tmp_3 += 8;
                pu1_dst_tmp_3 += 8;
                pu1_dst_tmp_2 = pu1_dst_tmp_3;
                cond = 1;
            }
        }
    }
}
/* INTRA_PRED_LUMA_VER */

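/* Illustrative scalar sketch (not part of the decoder) of the vertical path
 * above: every destination row copies the top reference row, and for nt < 32
 * the first column is additionally filtered against the left neighbours, as
 * quoted in the unrolling comments inside the function:
 *
 *     for(row = 0; row < nt; row++)
 *     {
 *         pu1_dst[row * dst_strd] = saturate_u8(pu1_ref[two_nt + 1] +
 *                 ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1));
 *         for(col = 1; col < nt; col++)
 *             pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
 *     }
 *
 * saturate_u8 here stands in for the vqaddq_s16/vqmovun_s16 clipping done in
 * the NEON code; for nt == 32 the whole block is a plain copy.
 */
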
/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma mode2.
*
* @par Description:
*    Intraprediction for mode 2 (SW angle) with reference neighboring samples
*    location pointed by 'pu1_ref' to the TU block location pointed by
*    'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source reference samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    /* rev_res is named for the reversed result value it holds                              */
    /* Loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8         */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4                 */

    if(0 != (nt & 7))
    {
        UWORD8 *pu1_ref_tmp = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        uint8x8_t pu1_src_val, rev_res;
        uint64x1_t shift_res;

        for(col = nt; col > 0; col -= 4)
        {
            for(row = nt; row > 0; row -= 4)
            {
                /* unrolling all cols & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */

                pu1_src_val = vld1_u8(pu1_ref_tmp);
                shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
                rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;
            }
        }
    }

    /* rev_val_first and rev_val_second reverse the loaded values to restore the right order */
    /* shift_val shifts the reversed second vector to line up the samples that are needed    */
    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                  */

    else
    {
        UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;

        uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
        uint64x1_t shift_val;

        two_nt = 2 * nt;
        pu1_ref_two_nt_minus2 += (two_nt);
        pu1_ref_two_nt_minus2 -= 8;

        for(col = nt; col > 0; col -= 8)
        {
            for(row = nt; row > 0; row -= 8)
            {
                pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_first = vrev64_u8(pu1_src_val2);

                pu1_ref_two_nt_minus2 -= 8;
                pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_second = vrev64_u8(pu1_src_val1);

                vext_t = vext_u8(rev_val_first, rev_val_second, 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
            }
            pu1_dst_tmp_plus8 += 8;
            pu1_dst_tmp = pu1_dst_tmp_plus8;
            pu1_ref_two_nt_minus2 += (nt - 8);
        }
    }
}
/* INTRA_PRED_LUMA_MODE2 */

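/* Illustrative scalar sketch (not part of the decoder) of mode 2: the angle
 * is fixed at 45 degrees (intra_pred_ang = 32), so idx in the unrolling
 * comment above reduces to (row + 1) and the whole block becomes
 *
 *     for(col = 0; col < nt; col++)
 *         for(row = 0; row < nt; row++)
 *             pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - (row + 1) - 1];
 *
 * i.e. each anti-diagonal of the block holds a single reference sample, which
 * is why the NEON code can reverse a loaded vector once and then reuse
 * shifted copies of it for the successive rows.
 */
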
/**
*******************************************************************************
*
* @brief
*   Intra prediction interpolation filter for luma mode 18 & mode 34.
*
* @par Description:
*    Intraprediction for mode 18 (NW angle) and mode 34 (NE angle) with
*    reference neighboring samples location pointed by 'pu1_ref' to the TU
*    block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source reference samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{
    WORD32 row, col, idx;
    WORD32 intraPredAngle = 32;
    WORD32 two_nt;
    UNUSED(src_strd);
    two_nt = 2 * nt;

    UWORD8 *pu1_ref_tmp = pu1_ref;
    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;

    uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;

    /* src_tmp_1st and src_tmp_2nd load the first eight and the next eight source (pu1_ref) values     */
    /* vext1 - vext7 hold the vext results between the two loaded vectors, which also helps dual issue */
    /* Loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8                    */
    /* Rows and columns are unrolled by 8 when the width is a multiple of 8                            */
    /* Separate loops are maintained for mode 18 and mode 34                                           */

    /* cond to allow multiples of 8 */
    if(0 == (nt & 7))
    {
        if(mode == 34)
        {
            pu1_ref_tmp += (two_nt + 2);

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp += 8;

                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);

                    /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;

                    vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;

                    vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;

                    vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;

                    vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;

                    vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;

                    vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;

                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }

                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp -= (nt - 8);
            }
        }
        else /* Loop for mode 18 */
        {
            pu1_ref_tmp += (two_nt);

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp -= 8;

                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);

                    /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;

                    vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;

                    vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;

                    vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;

                    vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;

                    vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;

                    vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;

                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }
                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp += (nt + 8);
            }
        }
    }

    /* Rows and columns are unrolled by 4 when the width is a multiple of 4  */

    else /* loop for multiples of 4 */
    {
        uint8x8_t src_val1;
        uint8x8_t src_val2;

        if(mode == 18)
            intraPredAngle = -32;
        else if(mode == 34)
            intraPredAngle = 32;

        for(row = 0; row < nt; row += 2)
        {
            /* unrolling 2 rows */
            idx = ((row + 1) * intraPredAngle) >> 5;
            pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
            src_val1 = vld1_u8(pu1_ref_tmp);

            idx = ((row + 2) * intraPredAngle) >> 5;
            pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
            src_val2 = vld1_u8(pu1_ref_tmp1);

            /* unrolling 4 cols */
            for(col = nt; col > 0; col -= 4)
            {
                pu1_dst_tmp = pu1_dst;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
                pu1_dst_tmp += dst_strd;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_MODE_18_34 */

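/* Illustrative scalar sketch (not part of the decoder) of modes 18/34: with
 * intraPredAngle = -32 (mode 18) or +32 (mode 34),
 * idx = ((row + 1) * intraPredAngle) >> 5 is exactly -(row + 1) or +(row + 1),
 * so each row is the previous row shifted by one reference sample:
 *
 *     for(row = 0; row < nt; row++)
 *     {
 *         idx = ((row + 1) * intraPredAngle) >> 5;
 *         for(col = 0; col < nt; col++)
 *             pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
 *     }
 *
 * No fractional interpolation is needed for these two angles, which is why
 * the NEON code can realize the shifts with vext_u8 on two loaded vectors.
 */
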
/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 3 to mode 9
 *
 * @par Description:
 *    Intraprediction for modes 3 to 9 (positive angle, horizontal modes) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the source reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
                                                WORD32 src_strd,
                                                UWORD8 *pu1_dst,
                                                WORD32 dst_strd,
                                                WORD32 nt,
                                                WORD32 mode)
{
    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 100, fract_prev;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {
        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        WORD32 two_nt = 2 * nt;

        pu1_ref_main_idx += two_nt;
        pu1_ref_main_idx_1 += two_nt - 1;

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        pu1_ref_main_idx -= 8;
        pu1_ref_main_idx_1 -= 8;

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;

            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

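            /* The loop tail below steps the reference base back by one sample
             * per column (for these angles the integer part of pos usually
             * advances by 1 each column); when fract grows instead of
             * wrapping, the integer part did not advance, and this +1 cancels
             * the default step. fract starts at 100 (> any 5-bit value) so
             * the first column never triggers the adjustment. */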
            if(fract_prev < fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                pu1_ref_main_idx -= 8;
                pu1_ref_main_idx_1 -= 8;
            }
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

            pu1_ref_main_idx += nt;
            pu1_ref_main_idx_1 += nt;

            pu1_ref_main_idx -= 1;
            pu1_ref_main_idx_1 -= 1;
        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        pu1_ref_tmp1 += nt;
        pu1_ref_tmp2 += (nt - 1);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t  pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
            }
            pu1_ref_tmp1 -= 1;
            pu1_ref_tmp2 -= 1;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}

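/* Worked example of the two-tap filter used above (illustrative only): for
 * mode 4, gai4_ihevc_ang_table gives intra_pred_ang = 21, so for the first
 * column pos = 21 and fract = 21, and each output sample is
 *
 *     pred = (11 * ref[i] + 21 * ref[i + 1] + 16) >> 5
 *
 * which is what the vmull_u8/vaddq_u16/vrshrn_n_u16 sequence computes eight
 * (or four) samples at a time; the +16 rounding term comes from the rounding
 * narrowing shift vrshrn_n_u16(add_res, 5).
 */
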
/**
 *******************************************************************************
 *
 * @brief
 *   Intra prediction interpolation filter for luma mode 11 to mode 17
 *
 * @par Description:
 *    Intraprediction for modes 11 to 17 (negative angle, horizontal modes)
 *    with reference neighboring samples location pointed by 'pu1_ref' to the
 *    TU block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the source reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{
    WORD32 row, col, k;
    WORD32 two_nt;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    WORD32 pos, fract = 1000, fract_prev;
    WORD32 ref_idx;

    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;
    UWORD8 *pu1_dst_tmp2 = pu1_dst;

    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_left_t;
    uint32x2_t  ref_left_tmp;
    UNUSED(src_strd);
    ref_left_tmp = vdup_n_u32(0);

    inv_ang_sum = 128;
    two_nt = 2 * nt;

    intra_pred_ang = gai4_ihevc_ang_table[mode];

    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];

    pu1_ref_tmp1 += two_nt;

    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);

        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_t = vld1_u8(pu1_ref_tmp2);

            ref_left_t = vrev64_u8(ref_left_t);
            vst1_u8(ref_main_tmp, ref_left_t);
            ref_main_tmp += 8;
            pu1_ref_tmp2 -= 8;
        }
    }
    else
    {
        uint8x8_t rev_val;
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);

            rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
            vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
        }
    }

    ref_main[nt] = pu1_ref[two_nt - nt];

    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */

    ref_idx = (nt * intra_pred_ang) >> 5;

    /* SIMD optimization can be done using a look-up table for this loop */
    /* For negative angles, derive the main reference samples from the side */
    /* reference samples; refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }
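    /* Equivalently, ref_main[-k] (k = 1, 2, ...) picks the above-row sample at
     * pu1_ref[two_nt + ((k * inv_ang + 128) >> 8)]: the side (above) reference
     * is projected onto the extension of the main (left) reference with
     * rounding, inv_ang being the fixed-point inverse of the prediction angle. */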

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between 2 neighboring */
        /* samples, weighted by distance, to obtain the destination sample */
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            /* Do linear filtering */
            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;
            }

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *   Intra prediction interpolation filter for luma mode 19 to mode 25
 *
 * @par Description:
 *    Intraprediction for modes 19 to 25 (negative angle, vertical modes) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the source reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{
    WORD32 row, col, k;
    WORD32 two_nt, intra_pred_ang;
    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
    WORD32 ref_idx;
    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_above_t;
    uint32x2_t ref_above_tmp;
    UNUSED(src_strd);
    ref_above_tmp = vdup_n_u32(0);

    two_nt = 2 * nt;
    intra_pred_ang = gai4_ihevc_ang_table[mode];
    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];

    /* Intermediate reference samples for negative angle modes */
    /* This has to be removed during optimization */
    pu1_ref_tmp1 += two_nt;

    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);
        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_above_t = vld1_u8(pu1_ref_tmp1);
            vst1_u8(ref_main_tmp, ref_above_t);
            ref_main_tmp += 8;
            pu1_ref_tmp1 += 8;
        }
    }
    else
    {
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 4)
        {
            ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
            vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
        }
    }

    ref_main[nt] = pu1_ref[two_nt + nt];

    /* For vertical modes, (ref main = ref above) (ref side = ref left) */

    ref_idx = (nt * intra_pred_ang) >> 5;
    inv_ang_sum = 128;

    /* SIMD optimization can be done using a look-up table for this loop */
    /* For negative angles, derive the main reference samples from the side */
    /* reference samples; refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between 2 neighboring */
        /* samples, weighted by distance, to obtain the destination sample */
        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            /* Do linear filtering */
            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;
            }
            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
}

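/* A scalar sketch (illustrative only; index conventions follow the reference
 * C implementation of these modes and should be treated as an assumption) of
 * the per-row filtering the loops above perform on the extended main
 * reference:
 *
 *     for(row = 0; row < nt; row++)
 *     {
 *         pos = (row + 1) * intra_pred_ang;
 *         idx = pos >> 5;
 *         fract = pos & 31;
 *         for(col = 0; col < nt; col++)
 *             pu1_dst[row * dst_strd + col] =
 *                 ((32 - fract) * ref_main[col + idx + 1]
 *                  + fract * ref_main[col + idx + 2] + 16) >> 5;
 *     }
 *
 * The NEON version folds idx into the ref_main_tmp1/ref_main_tmp2 pointer
 * updates (the fract_prev comparison) instead of recomputing it per row.
 */
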
/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 27 to mode 33
 *
 * @par Description:
 *    Intraprediction for modes 27 to 33 (positive angle, vertical modes) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the source reference samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{
    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 0, fract_prev;

    WORD32 two_nt = 2 * nt;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {
        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        pu1_ref_main_idx += (two_nt + 1);
        pu1_ref_main_idx_1 += (two_nt + 2);

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;

            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

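            /* For these positive angles pos grows by intra_pred_ang (< 32 for
             * modes 27-33) every row, so the integer sample offset advances by
             * at most one per row; a wrap in fract (fract_prev > fract) marks
             * exactly those rows, and the base pointers move up by one. E.g.
             * for mode 30 (intra_pred_ang = 13) fract runs 13, 26, 7, ... and
             * the wrap from 26 to 7 is the first one-sample step. */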
            if(fract_prev > fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                pu1_ref_main_idx += 8;
                pu1_ref_main_idx_1 += 8;
            }

            pu1_ref_main_idx -= nt;
            pu1_ref_main_idx_1 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;

        pu1_ref_tmp1 += (two_nt + 1);
        pu1_ref_tmp2 += (two_nt + 2);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t  pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev > fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;
            }

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
}
