1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_intra_pred_filters_neon_intr.c
22 *
23 * @brief
24 * Contains function Definition for intra prediction interpolation filters
25 *
26 *
27 * @author
28 * Yogeswaran RS
29 *
30 * @par List of Functions:
31 * - ihevc_intra_pred_luma_planar()
32 * - ihevc_intra_pred_luma_dc()
33 * - ihevc_intra_pred_luma_horz()
34 * - ihevc_intra_pred_luma_ver()
35 * - ihevc_intra_pred_luma_mode2()
36 * - ihevc_intra_pred_luma_mode_18_34()
37 *
38 * @remarks
39 * None
40 *
41 *******************************************************************************
42 */
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46 #include <stdio.h>
47
48 #include "ihevc_typedefs.h"
49 #include "ihevc_intra_pred.h"
50 #include "ihevc_macros.h"
51 #include "ihevc_func_selector.h"
52 #include "arm_neon.h"
53 #include "ihevc_platform_macros.h"
54 #include "ihevc_common_tables.h"
55
56 /****************************************************************************/
57 /* Constant Macros */
58 /****************************************************************************/
59 #define MAX_CU_SIZE 64
60 #define BIT_DEPTH 8
61 #define T32_4NT 128
62 #define T16_4NT 64
63
64
65
66 /*****************************************************************************/
67 /* Table Look-up */
68 /*****************************************************************************/
69
70 #define GET_BITS(y,x) ((y) & (1 << x)) && (1 << x)
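/* Illustrative note (added): GET_BITS(y, x) evaluates to 1 when bit x of y is set
 * and to 0 otherwise, since the second operand of && is the non-zero constant (1 << x).
 * A minimal usage sketch with an assumed flag word:
 *
 *     WORD32 flags = 0x14;                  // binary 1 0 1 0 0
 *     WORD32 bit2  = GET_BITS(flags, 2);    // 1 : bit 2 is set
 *     WORD32 bit3  = GET_BITS(flags, 3);    // 0 : bit 3 is clear
 */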
71
72 /*****************************************************************************/
73 /* Function Definition */
74 /*****************************************************************************/
75
76 /**
77 *******************************************************************************
78 *
79 * @brief
80 * Intra prediction interpolation filter for pu1_ref substitution
81 *
82 *
83 * @par Description:
84 * Reference substitution process for samples unavailable for prediction.
85 * Refer to section 8.4.4.2.2 in the standard
86 *
87 * @param[in] pu1_top_left
88 * UWORD8 pointer to the top-left
89 *
90 * @param[in] pu1_top
91 * UWORD8 pointer to the top
92 *
93 * @param[in] pu1_left
94 * UWORD8 pointer to the left
95 *
96 * @param[in] src_strd
97 * WORD32 Source stride
98 *
99 * @param[in] nbr_flags
100 * WORD32 neighbor availability flags
101 *
102 * @param[in] nt
103 * WORD32 transform Block size
104 *
105 * @param[in] dst_strd
106 * WORD32 Destination stride
107 *
108 * @returns
109 *
110 * @remarks
111 * None
112 *
113 *******************************************************************************
114 */
115
116
117 void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
118 UWORD8 *pu1_top,
119 UWORD8 *pu1_left,
120 WORD32 src_strd,
121 WORD32 nt,
122 WORD32 nbr_flags,
123 UWORD8 *pu1_dst,
124 WORD32 dst_strd)
125 {
126 UWORD8 pu1_ref;
127 WORD32 dc_val, i;
128 WORD32 total_samples = (4 * nt) + 1;
129 WORD32 two_nt = 2 * nt;
130 WORD32 three_nt = 3 * nt;
131 WORD32 get_bits;
132 WORD32 next;
133 WORD32 bot_left, left, top, tp_right, tp_left;
134 WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
135 UNUSED(dst_strd);
136 dc_val = 1 << (BIT_DEPTH - 1);
137
138 /* Neighbor Flag Structure*/
139 /* Top-Left | Top-Right | Top  | Left | Bottom-Left
140         1   |     4     |  4   |  4   |      4
141 */
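/* Worked example (added, derived from the bit tests below): bits 0-3 carry the
 * bottom-left flags, bits 4-7 left, bits 8-11 top, bits 12-15 top-right and bit 16
 * the single top-left flag. So, for a 32x32 TU with every neighbour available,
 * nbr_flags would be 0x1FFFF, while 0x10110 would flag only top-left, top and left
 * as available for a small TU (nt <= 8) where just the low bit of each field is read.
 */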
142
143 /* If no neighbor flags are present, fill the neighbor samples with DC value */
144 if(nbr_flags == 0)
145 {
146 for(i = 0; i < total_samples; i++)
147 {
148 pu1_dst[i] = dc_val;
149 }
150 }
151 else
152 {
153 /* Else fill the corresponding samples */
154 pu1_dst[two_nt] = *pu1_top_left;
155 UWORD8 *pu1_dst_tmp2 = pu1_dst;
156 UWORD8 *pu1_top_tmp = pu1_top;
157 pu1_dst_tmp2 += two_nt + 1;
158
159 for(i = 0; i < two_nt; i++)
160 pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
161
162 uint8x8_t src;
163 for(i = two_nt; i > 0; i -= 8)
164 {
165 src = vld1_u8(pu1_top_tmp);
166 pu1_top_tmp += 8;
167 vst1_u8(pu1_dst_tmp2, src);
168 pu1_dst_tmp2 += 8;
169 }
170
171 if(nt <= 8)
172 {
173 /* 1 bit extraction for all the neighboring blocks */
174 tp_left = (nbr_flags & 0x10000) >> 16;
175 bot_left = nbr_flags & 0x1;
176 left = (nbr_flags & 0x10) >> 4;
177 top = (nbr_flags & 0x100) >> 8;
178 tp_right = (nbr_flags & 0x1000) >> 12;
179
180 next = 1;
181
182 /* If bottom-left is not available, reverse substitution process */
183 if(bot_left == 0)
184 {
185 WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
186
187 /* Check for the 1st available sample from bottom-left*/
188 while(!a_nbr_flag[next])
189 next++;
190
191 /* If Left, top-left are available*/
192 if(next <= 2)
193 {
194 idx = nt * next;
195 pu1_ref = pu1_dst[idx];
196 for(i = 0; i < idx; i++)
197 pu1_dst[i] = pu1_ref;
198 }
199 else /* If top, top-right are available */
200 {
201 /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
202 idx = (nt * (next - 1)) + 1;
203 pu1_ref = pu1_dst[idx];
204 for(i = 0; i < idx; i++)
205 pu1_dst[i] = pu1_ref;
206 }
207 }
208
209 /* Forward Substitution Process */
210 /* If left is Unavailable, copy the last bottom-left value */
211
212 if(left == 0)
213 {
214 uint8x8_t dup_pu1_dst1;
215 UWORD8 *pu1_dst_const_nt = pu1_dst;
216 pu1_dst_const_nt += nt;
217
218 if(0 == (nt & 7))
219 {
220 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
221 for(i = nt; i > 0; i -= 8)
222 {
223 vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
224 pu1_dst_const_nt += 8;
225
226 }
227 }
228 else
229 {
230 //uint32x2_t dup_pu1_dst4;
231 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
232 //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]);
233 for(i = nt; i > 0; i -= 4)
234 {
235 vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
236 pu1_dst_const_nt += 4;
237
238 }
239
240 }
241
242 }
243 if(tp_left == 0)
244 pu1_dst[two_nt] = pu1_dst[two_nt - 1];
245 if(top == 0)
246 {
247
248 if(0 == (nt & 7))
249 {
250 uint8x8_t dup_pu1_dst2;
251 UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
252 pu1_dst_const_two_nt_1 += (two_nt + 1);
253 dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
254 for(i = nt; i > 0; i -= 8)
255 {
256 vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
257 pu1_dst_const_two_nt_1 += 8;
258
259 }
260 }
261 else
262 {
263 for(i = 0; i < nt; i++)
264 pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
265 }
266 }
267 if(tp_right == 0)
268 {
269 uint8x8_t dup_pu1_dst3;
270 UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
271 pu1_dst_const_three_nt_1 += (three_nt + 1);
272 dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
273 if(0 == (nt & 7))
274 {
275 for(i = nt; i > 0; i -= 8)
276 {
277 vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
278 pu1_dst_const_three_nt_1 += 8;
279
280 }
281 }
282 else
283 {
284 for(i = nt; i > 0; i -= 4)
285 {
286 vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
287 pu1_dst_const_three_nt_1 += 4;
288 }
289
290 }
291
292 }
293 }
294 if(nt == 16)
295 {
296 WORD32 nbr_flags_temp = 0;
297 nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
298 + ((nbr_flags & 0x300) >> 4)
299 + ((nbr_flags & 0x3000) >> 6)
300 + ((nbr_flags & 0x10000) >> 8);
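/* Note (added): for nt == 16 each side needs only 2 of its 4 flag bits (one bit per
 * 8 pels), so the statement above repacks nbr_flags into a compact 9-bit word:
 * bits 0-1 bottom-left, 2-3 left, 4-5 top, 6-7 top-right and bit 8 top-left.
 * The substitution loop below walks this word the same way the nt == 32 path
 * walks the full 17-bit value.
 */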
301
302 /* Compute trailing zeros based on nbr_flags for the substitution process of below-left; see section 8.4.4.2.2 */
303 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
304 {
305 nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
306
307 if(nbr_id_from_bl == 64)
308 nbr_id_from_bl = 32;
309
310 if(nbr_id_from_bl == 32)
311 {
312 /* for top left : 1 pel per nbr bit */
313 if(!((nbr_flags_temp >> 8) & 0x1))
314 {
315 nbr_id_from_bl++;
316 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
317 }
318 }
319 /* Reverse Substitution Process*/
320 if(nbr_id_from_bl)
321 {
322 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
323 pu1_ref = pu1_dst[nbr_id_from_bl];
324 for(i = (nbr_id_from_bl - 1); i >= 0; i--)
325 {
326 pu1_dst[i] = pu1_ref;
327 }
328 }
329 }
330
331 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
332 while(nbr_id_from_bl < ((T16_4NT) + 1))
333 {
334 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
335 /* Divide by 8 to obtain the original index */
336 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
337
338 /* The Top-left flag is at the last bit location of nbr_flags*/
339 if(nbr_id_from_bl == (T16_4NT / 2))
340 {
341 get_bits = GET_BITS(nbr_flags_temp, 8);
342
343 /* only pel substitution for TL */
344 if(!get_bits)
345 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
346 }
347 else
348 {
349 get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
350 if(!get_bits)
351 {
352 /* 8 pel substitution (other than TL) */
353 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
354 for(i = 0; i < 8; i++)
355 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
356 }
357
358 }
359 nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
360 }
361 }
362
363 if(nt == 32)
364 {
365 /* Compute trailing zeros based on nbr_flags for the substitution process of below-left; see section 8.4.4.2.2 */
366 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
367 {
368 nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
369
370 if(nbr_id_from_bl == 64)
371 {
372 /* for top left : 1 pel per nbr bit */
373 if(!((nbr_flags >> 16) & 0x1))
374 {
375 /* top left not available */
376 nbr_id_from_bl++;
377 /* top and top right; 8 pels per nbr bit */
378 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
379 }
380 }
381 /* Reverse Substitution Process*/
382 if(nbr_id_from_bl)
383 {
384 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
385 pu1_ref = pu1_dst[nbr_id_from_bl];
386 for(i = (nbr_id_from_bl - 1); i >= 0; i--)
387 pu1_dst[i] = pu1_ref;
388 }
389 }
390
391 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
392 while(nbr_id_from_bl < ((T32_4NT)+1))
393 {
394 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
395 /* Divide by 8 to obtain the original index */
396 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
397
398 /* The Top-left flag is at the last bit location of nbr_flags*/
399 if(nbr_id_from_bl == (T32_4NT / 2))
400 {
401 get_bits = GET_BITS(nbr_flags, 16);
402 /* only pel substitution for TL */
403 if(!get_bits)
404 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
405 }
406 else
407 {
408 get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
409 if(!get_bits)
410 {
411 /* 8 pel substitution (other than TL) */
412 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
413 for(i = 0; i < 8; i++)
414 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
415 }
416
417 }
418 nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
419 }
420 }
421
422 }
423
424 }
425
426 /**
427 *******************************************************************************
428 *
429 * @brief
430 * Intra prediction interpolation filter for ref_filtering
431 *
432 *
433 * @par Description:
434 * Reference DC filtering for neighboring samples dependent on TU size and
435 * mode Refer to section 8.4.4.2.3 in the standard
436 *
437 * @param[in] pu1_src
438 * UWORD8 pointer to the source
439 *
440 * @param[out] pu1_dst
441 * UWORD8 pointer to the destination
442 *
443 * @param[in] nt
444 * integer Transform Block size
445 *
446 * @param[in] mode
447 * integer intraprediction mode
448 *
449 * @returns
450 *
451 * @remarks
452 * None
453 *
454 *******************************************************************************
455 */
456
457
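/* Reference sketch (added): scalar equivalent of the filtering performed below,
 * using the same argument names. The normal smoothing is the [1 2 1]/4 filter of
 * section 8.4.4.2.3; the strong (bi-linear) variant is taken only for nt == 32:
 *
 *     for(i = 1; i < 4 * nt; i++)                          // normal filtering
 *         pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
 *
 *     for(i = 1; i < 2 * nt; i++)                          // strong filtering
 *     {
 *         pu1_dst[i]          = ((64 - i) * pu1_src[0]      + i * pu1_src[2 * nt] + 32) >> 6;
 *         pu1_dst[2 * nt + i] = ((64 - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
 *     }
 *
 * In both variants pu1_dst[0] and pu1_dst[4 * nt] keep the unfiltered source values,
 * and the strong variant also copies pu1_dst[2 * nt] unchanged.
 */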
458 void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
459 WORD32 nt,
460 UWORD8 *pu1_dst,
461 WORD32 mode,
462 WORD32 strong_intra_smoothing_enable_flag)
463 {
464 WORD32 filter_flag;
465 WORD32 i = 0;
466 WORD32 four_nt = 4 * nt;
467
468 WORD32 src_4nt;
469 WORD32 src_0nt;
470 /* Naming has been made as per the functionality it has, e.g. pu1_src_tmp_1 denotes pu1_src + 1 */
471 /* src_val_1 holds the value loaded from pointer pu1_src_tmp_1, add_res has the result of adding 2 values */
472 UWORD8 *pu1_src_tmp_0 = pu1_src;
473 UWORD8 *pu1_src_tmp_1;
474 UWORD8 *pu1_src_tmp_2;
475 UWORD8 *pu1_dst_tmp_0 = pu1_dst;
476 UWORD8 *pu1_dst_tmp_1;
477
478 uint8x8_t src_val_0, src_val_2;
479 uint8x8_t src_val_1, shift_res;
480 uint8x8_t dup_const_2;
481 uint16x8_t mul_res, add_res;
482 WORD32 bi_linear_int_flag = 0;
483 WORD32 abs_cond_left_flag = 0;
484 WORD32 abs_cond_top_flag = 0;
485 WORD32 dc_val = 1 << (BIT_DEPTH - 5);
486 shift_res = vdup_n_u8(0);
487
488 filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
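/* Note (added): gau1_intra_pred_ref_filter[mode] appears to hold one enable bit per
 * transform size; CTZ(nt) - 2 maps nt = 4/8/16/32 to bit 0/1/2/3, so filter_flag is
 * non-zero only for the (mode, nt) pairs for which the standard requests smoothing.
 */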
489
490 if(0 == filter_flag)
491 {
492 if(pu1_src == pu1_dst)
493 {
494 return;
495 }
496 else
497 {
498 for(i = four_nt; i > 0; i -= 8)
499 {
500 src_val_0 = vld1_u8(pu1_src_tmp_0);
501 pu1_src_tmp_0 += 8;
502 vst1_u8(pu1_dst_tmp_0, src_val_0);
503 pu1_dst_tmp_0 += 8;
504 }
505 pu1_dst[four_nt] = pu1_src[four_nt];
506 }
507 }
508
509 else
510 {
511 /* If strong intra smoothing is enabled and transform size is 32 */
512 if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
513 {
514 /*Strong Intra Filtering*/
515 abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
516 - (2 * pu1_src[3 * nt]))) < dc_val;
517 abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
518 - (2 * pu1_src[nt]))) < dc_val;
519
520 bi_linear_int_flag = ((1 == abs_cond_left_flag)
521 && (1 == abs_cond_top_flag));
522 }
523
524 src_4nt = pu1_src[4 * nt];
525 src_0nt = pu1_src[0];
526 /* Strong filtering of reference samples */
527 if(1 == bi_linear_int_flag)
528 {
529 WORD32 two_nt = four_nt >> 1;
530
531 WORD32 pu1_src_0_val = pu1_src[0];
532 WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
533 WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];
534
535 WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
536 uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);
537
538 WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
539 uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);
540
541 const UWORD8 *const_col_i;
542 uint8x8_t const_col_i_val;
543 uint16x8_t prod_val_1;
544 uint16x8_t prod_val_2;
545 uint16x8_t prod_val_3;
546 uint16x8_t prod_val_4;
547 uint8x8_t res_val_1;
548 uint8x8_t res_val_2;
549 uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
550 uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
551 uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
552 pu1_dst_tmp_0 = pu1_dst + 1;
553 pu1_dst_tmp_1 = pu1_dst + two_nt + 1;
554
555 const_col_i = gau1_ihevc_planar_factor + 1;
556
557 for(i = two_nt; i > 0; i -= 8)
558 {
559 const_col_i_val = vld1_u8(const_col_i);
560 const_col_i += 8;
561
562 prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
563 prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);
564
565 res_val_1 = vrshrn_n_u16(prod_val_2, 6);
566 prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);
567
568 vst1_u8(pu1_dst_tmp_0, res_val_1);
569 pu1_dst_tmp_0 += 8;
570 prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);
571
572 res_val_2 = vrshrn_n_u16(prod_val_4, 6);
573 vst1_u8(pu1_dst_tmp_1, res_val_2);
574 pu1_dst_tmp_1 += 8;
575 }
576 pu1_dst[2 * nt] = pu1_src[2 * nt];
577 }
578 else
579 {
580 pu1_src_tmp_1 = pu1_src + 1;
581 pu1_src_tmp_2 = pu1_src + 2;
582 pu1_dst_tmp_0 += 1;
583
584 dup_const_2 = vdup_n_u8(2);
585
586 /* Extremities Untouched*/
587 pu1_dst[0] = pu1_src[0];
588
589 /* To avoid the issue when the dest and src have the same pointer, this load has been done
590 * outside and the 2nd consecutive load is done before the store of the 1st */
591
592 /* Perform bilinear filtering of Reference Samples */
593 for(i = (four_nt - 1); i > 0; i -= 8)
594 {
595 src_val_0 = vld1_u8(pu1_src_tmp_0);
596 pu1_src_tmp_0 += 8;
597
598 src_val_2 = vld1_u8(pu1_src_tmp_2);
599 pu1_src_tmp_2 += 8;
600
601 src_val_1 = vld1_u8(pu1_src_tmp_1);
602 pu1_src_tmp_1 += 8;
603
604 if(i < four_nt - 1)
605 {
606 vst1_u8(pu1_dst_tmp_0, shift_res);
607 pu1_dst_tmp_0 += 8;
608 }
609
610 add_res = vaddl_u8(src_val_0, src_val_2);
611
612 mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
613 shift_res = vrshrn_n_u16(mul_res, 2);
614
615 }
616 vst1_u8(pu1_dst_tmp_0, shift_res);
617 pu1_dst_tmp_0 += 8;
618 }
619 pu1_dst[4 * nt] = src_4nt;
620 pu1_dst[0] = src_0nt;
621 }
622
623 }
624
625
626
627 /**
628 *******************************************************************************
629 *
630 * @brief
631 * Intra prediction interpolation filter for luma planar
632 *
633 * @par Description:
634 * Planar Intraprediction with reference neighboring samples location
635 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
636 *
637 * @param[in] pu1_src
638 * UWORD8 pointer to the source
639 *
640 * @param[out] pu1_dst
641 * UWORD8 pointer to the destination
642 *
643 * @param[in] src_strd
644 * integer source stride
645 *
646 * @param[in] dst_strd
647 * integer destination stride
648 *
649 * @param[in] nt
650 * integer Transform Block size
651 *
652 * @param[in] wd
653 * integer width of the array
654 *
655 * @returns
656 *
657 * @remarks
658 * None
659 *
660 *******************************************************************************
661 */
662
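/* Reference sketch (added): scalar form of the planar prediction computed below.
 * With the reference layout used throughout this file (left samples at
 * pu1_ref[2 * nt - 1 - row], top samples at pu1_ref[2 * nt + 1 + col]) this is the
 * section 8.4.4.2.4 formula:
 *
 *     for(row = 0; row < nt; row++)
 *         for(col = 0; col < nt; col++)
 *             pu1_dst[row * dst_strd + col] =
 *                 ((nt - 1 - col) * pu1_ref[2 * nt - 1 - row]
 *                  + (col + 1)      * pu1_ref[3 * nt + 1]
 *                  + (nt - 1 - row) * pu1_ref[2 * nt + 1 + col]
 *                  + (row + 1)      * pu1_ref[nt - 1]
 *                  + nt) >> (log2nt_plus1);
 */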
663 void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
664 WORD32 src_strd,
665 UWORD8 *pu1_dst,
666 WORD32 dst_strd,
667 WORD32 nt,
668 WORD32 mode)
669 {
670 /* Variables are named after the term they hold, e.g. (nt - 1 - col) --> const_nt_1_col (const denotes gau1_ihevc_planar_factor) */
671 /* const_nt_1_col values are loaded into a d register */
672 /* Similarly, pu1_ref[nt - 1] --> pu1_ref_nt_1 */
673 /* The value of pu1_ref_nt_1 duplicated into a d register is pu1_ref_nt_1_dup */
674 /* The (log2nt + 1) shift is taken care of while assigning the values */
675 /* In the width-multiple-of-4 case the row loop has also been unrolled by 2 and the stores are handled accordingly */
676
677 WORD32 row, col = 0;
678 WORD32 log2nt_plus1 = 6;
679 WORD32 two_nt, three_nt;
680 UWORD8 *pu1_ref_two_nt_1;
681 UWORD8 *pu1_dst_tmp;
682 const UWORD8 *const_nt_1_col;
683 uint8x8_t const_nt_1_col_t;
684 const UWORD8 *const_col_1;
685 uint8x8_t const_col_1_t;
686 uint8_t const_nt_1_row;
687 uint8x8_t const_nt_1_row_dup;
688 uint8_t const_row_1;
689 uint8x8_t const_row_1_dup;
690 uint8_t const_nt = nt;
691 uint16x8_t const_nt_dup;
692 uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
693 uint8x8_t pu1_ref_nt_1_dup;
694 uint8_t pu1_ref_two_nt_1_row;
695 uint8_t pu1_ref_three_nt_1;
696 uint8x8_t pu1_ref_two_nt_1_row_dup;
697 uint8x8_t pu1_ref_two_nt_1_t;
698 uint8x8_t pu1_ref_three_nt_1_dup;
699 uint16x8_t prod_t1;
700 uint16x8_t prod_t2;
701 uint16x8_t sto_res_tmp;
702 uint8x8_t sto_res;
703 int16x8_t log2nt_dup;
704 UNUSED(src_strd);
705 UNUSED(mode);
706 log2nt_plus1 = 32 - CLZ(nt);
707 two_nt = 2 * nt;
708 three_nt = 3 * nt;
709 /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
710 if(0 == (nt & 7))
711 {
712 pu1_dst_tmp = pu1_dst;
713 const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
714
715 const_col_1 = gau1_ihevc_planar_factor + 1;
716 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
717
718 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
719 const_nt_dup = vdupq_n_u16(const_nt);
720
721 log2nt_dup = vdupq_n_s16(log2nt_plus1);
722 log2nt_dup = vnegq_s16(log2nt_dup);
723
724 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
725
726 for(row = 0; row < nt; row++)
727 {
728 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
729 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
730
731 const_nt_1_row = nt - 1 - row;
732 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
733
734 const_row_1 = row + 1;
735 const_row_1_dup = vdup_n_u8(const_row_1);
736
737 const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
738
739 const_col_1 = gau1_ihevc_planar_factor + 1;
740 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
741
742 for(col = nt; col > 0; col -= 8)
743 {
744 const_nt_1_col_t = vld1_u8(const_nt_1_col);
745 const_nt_1_col -= 8;
746 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
747
748 const_col_1_t = vld1_u8(const_col_1);
749 const_col_1 += 8;
750 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
751
752 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
753 pu1_ref_two_nt_1 += 8;
754 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
755
756 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
757 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
758 prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
759 prod_t1 = vaddq_u16(prod_t1, prod_t2);
760
761 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
762 sto_res = vmovn_u16(sto_res_tmp);
763 vst1_u8(pu1_dst_tmp, sto_res);
764 pu1_dst_tmp += 8;
765 }
766 pu1_dst_tmp += dst_strd - nt;
767 }
768 }
769 /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
770 /* If the column count is a multiple of 4 then the height should be a multiple of 2 */
771 else
772 {
773 uint8x8_t const_row_1_dup1;
774 uint8x8_t pu1_ref_two_nt_1_t1;
775 uint8x8_t const_nt_1_col_t1;
776 uint8x8_t const_col_1_t1;
777 uint8x8_t pu1_ref_two_nt_1_row_dup1;
778 uint8x8_t const_nt_1_row_dup1;
779
780 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
781
782 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
783 const_nt_dup = vdupq_n_u16(const_nt);
784
785 log2nt_dup = vdupq_n_s16(log2nt_plus1);
786 log2nt_dup = vnegq_s16(log2nt_dup);
787
788 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
789
790 for(row = 0; row < nt; row += 2)
791 {
792 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
793 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
794 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
795 pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
796 pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);
797
798 const_nt_1_row = nt - 1 - row;
799 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
800 const_nt_1_row = nt - 2 - row;
801 const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
802 const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);
803
804 const_row_1 = row + 1;
805 const_row_1_dup = vdup_n_u8(const_row_1);
806 const_row_1 = row + 2;
807 const_row_1_dup1 = vdup_n_u8(const_row_1);
808 const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);
809
810 const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;
811
812 const_col_1 = gau1_ihevc_planar_factor + 1;
813
814 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
815
816 for(col = nt; col > 0; col -= 4)
817 {
818 const_nt_1_col_t = vld1_u8(const_nt_1_col);
819 const_nt_1_col -= 4;
820 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
821
822 const_col_1_t = vld1_u8(const_col_1);
823 const_col_1 += 4;
824 const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));
825
826 pu1_dst_tmp = pu1_dst;
827 const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);
828
829 const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
830 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
831
832 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
833 pu1_ref_two_nt_1 += 4;
834 const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);
835
836 pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
837 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
838
839 pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
840 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
841
842 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
843 prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
844 prod_t1 = vaddq_u16(prod_t1, prod_t2);
845
846 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
847 sto_res = vmovn_u16(sto_res_tmp);
848
849 vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
850 pu1_dst_tmp += dst_strd;
851
852 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
853 pu1_dst += 4;
854 }
855 pu1_dst += 2 * dst_strd - nt;
856 }
857 }
858
859 }
860 /* INTRA_PRED_LUMA_PLANAR */
861
862 /**
863 *******************************************************************************
864 *
865 * @brief
866 * Intra prediction interpolation filter for luma dc
867 *
868 * @par Description:
869 * Intraprediction for DC mode with reference neighboring samples location
870 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
871 *
872 * @param[in] pu1_src
873 * UWORD8 pointer to the source
874 *
875 * @param[out] pu1_dst
876 * UWORD8 pointer to the destination
877 *
878 * @param[in] src_strd
879 * integer source stride
880 *
881 * @param[in] dst_strd
882 * integer destination stride
883 *
884 * @param[in] nt
885 * integer Transform Block size
886 *
887 * @param[in] wd
888 * integer width of the array
889 *
890 * @returns
891 *
892 * @remarks
893 * None
894 *
895 *******************************************************************************
896 */
897
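/* Reference sketch (added): scalar form of the DC prediction computed below.
 * dc_val averages the nt left and nt top reference samples; for luma blocks with
 * nt other than 32 the first row and first column are additionally smoothed
 * towards the references:
 *
 *     acc = 0;
 *     for(i = nt; i < 2 * nt; i++)
 *         acc += pu1_ref[i] + pu1_ref[nt + 1 + i];
 *     dc_val = (acc + nt) >> log2nt_plus1;
 *
 *     pu1_dst[0]              = (pu1_ref[2 * nt - 1] + 2 * dc_val + pu1_ref[2 * nt + 1] + 2) >> 2;
 *     pu1_dst[col]            = (pu1_ref[2 * nt + 1 + col] + 3 * dc_val + 2) >> 2;   // first row
 *     pu1_dst[row * dst_strd] = (pu1_ref[2 * nt - 1 - row] + 3 * dc_val + 2) >> 2;   // first column
 *
 * All remaining samples are simply set to dc_val.
 */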
898 void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
899 WORD32 src_strd,
900 UWORD8 *pu1_dst,
901 WORD32 dst_strd,
902 WORD32 nt,
903 WORD32 mode)
904 {
905 WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
906 WORD32 i = 0;
907 WORD32 row = 0, col = 0, col_count;
908 WORD32 log2nt_plus1 = 6;
909 WORD32 two_nt = 0;
910 uint16x8_t ref_load_q;
911 uint16x8_t three_dc_val_t;
912 uint8x8_t sto_res_tmp;
913 uint8x8_t sto_res_tmp1;
914 uint8x8_t sto_res_tmp2;
915 uint8x8_t sto_res_tmp3;
916 uint8x8_t sto_res_tmp4;
917 uint8x8_t dc_val_t;
918
919 UWORD8 *pu1_ref_tmp;
920 UWORD8 *pu1_ref_tmp1;
921 UWORD8 *pu1_dst_tmp;
922 UWORD8 *pu1_dst_tmp1;
923 UWORD8 *pu1_dst_tmp2;
924 UNUSED(src_strd);
925 UNUSED(mode);
926
927 /* log2nt + 1 is taken care of while assigning the value itself. */
928 log2nt_plus1 = 32 - CLZ(nt);
929
930 /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
931 if(0 == (nt & 7))
932 {
933 uint8x8_t ref_load1;
934 uint8x8_t ref_load2;
935 uint16x4_t acc_dc_pair1;
936 uint32x2_t acc_dc_pair2;
937 uint64x1_t acc_dc = vdup_n_u64(col);
938
939 two_nt = 2 * nt;
940 pu1_ref_tmp = pu1_ref + nt;
941 pu1_ref_tmp1 = pu1_ref + two_nt + 1;
942
943 for(i = two_nt; i > nt; i -= 8)
944 {
945 ref_load1 = vld1_u8(pu1_ref_tmp);
946 pu1_ref_tmp += 8;
947 acc_dc_pair1 = vpaddl_u8(ref_load1);
948
949 ref_load2 = vld1_u8(pu1_ref_tmp1);
950 pu1_ref_tmp1 += 8;
951
952 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
953 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
954
955 acc_dc_pair1 = vpaddl_u8(ref_load2);
956 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
957 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
958 }
959
960 dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
961 dc_val_t = vdup_n_u8(dc_val);
962 two_dc_val = 2 * dc_val;
963 three_dc_val = 3 * dc_val;
964 three_dc_val += 2;
965
966 three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
967 pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
968 pu1_dst_tmp = pu1_dst;
969
970
971 if(nt == 32)
972 {
973 for(row = 0; row < nt; row++)
974 {
975 for(col = nt; col > 0; col -= 8)
976 {
977 vst1_u8(pu1_dst_tmp, dc_val_t);
978 pu1_dst_tmp += 8;
979 }
980 pu1_dst_tmp += dst_strd - nt;
981 }
982 }
983 else
984
985 {
986 for(col = nt; col > 0; col -= 8)
987 {
988 ref_load1 = vld1_u8(pu1_ref_tmp);
989 pu1_ref_tmp += 8;
990 ref_load_q = vmovl_u8(ref_load1);
991 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
992 ref_load_q = vshrq_n_u16(ref_load_q, 2);
993 sto_res_tmp = vmovn_u16(ref_load_q);
994 vst1_u8(pu1_dst_tmp, sto_res_tmp);
995 pu1_dst_tmp += 8;
996 }
997
998 pu1_ref_tmp = pu1_ref + two_nt - 9;
999 pu1_dst_tmp = pu1_dst + dst_strd;
1000 col_count = nt - 8;
1001
1002 /* Except for the first row, the remaining rows are done here */
1003 /* Both columns and rows have been unrolled by 8 */
1004 /* The stores have been taken care of for the unrolling */
1005 /* Except for the 1st column of the remaining rows (other than the 1st row), the values are */
1006 /* constant, hence a constant value is extracted and stored */
1007 /* If the column count is greater than 8, then the remaining values are constant, which is */
1008 /* taken care of in the inner for loop */
1009
1010 for(row = nt; row > 0; row -= 8)
1011 {
1012 pu1_dst_tmp1 = pu1_dst_tmp + 8;
1013 ref_load1 = vld1_u8(pu1_ref_tmp);
1014 pu1_ref_tmp -= 8;
1015 ref_load_q = vmovl_u8(ref_load1);
1016 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
1017 ref_load_q = vshrq_n_u16(ref_load_q, 2);
1018 sto_res_tmp = vmovn_u16(ref_load_q);
1019
1020 sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);
1021
1022 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
1023 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
1024 vst1_u8(pu1_dst_tmp, sto_res_tmp1);
1025 pu1_dst_tmp += dst_strd;
1026
1027 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
1028 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
1029 vst1_u8(pu1_dst_tmp, sto_res_tmp2);
1030 pu1_dst_tmp += dst_strd;
1031
1032 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
1033 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
1034 vst1_u8(pu1_dst_tmp, sto_res_tmp3);
1035 pu1_dst_tmp += dst_strd;
1036
1037 sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
1038 sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
1039 vst1_u8(pu1_dst_tmp, sto_res_tmp4);
1040 pu1_dst_tmp += dst_strd;
1041
1042 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
1043 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
1044 vst1_u8(pu1_dst_tmp, sto_res_tmp1);
1045 pu1_dst_tmp += dst_strd;
1046
1047 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
1048 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
1049 vst1_u8(pu1_dst_tmp, sto_res_tmp2);
1050 pu1_dst_tmp += dst_strd;
1051
1052 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
1053 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
1054 vst1_u8(pu1_dst_tmp, sto_res_tmp3);
1055 pu1_dst_tmp += dst_strd;
1056 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1057 if(row != 8)
1058 vst1_u8(pu1_dst_tmp, sto_res_tmp4);
1059 pu1_dst_tmp += dst_strd;
1060
1061 for(col = col_count; col > 0; col -= 8)
1062 {
1063 pu1_dst_tmp2 = pu1_dst_tmp1;
1064 vst1_u8(pu1_dst_tmp1, dc_val_t);
1065 pu1_dst_tmp1 += dst_strd;
1066 vst1_u8(pu1_dst_tmp1, dc_val_t);
1067 pu1_dst_tmp1 += dst_strd;
1068 vst1_u8(pu1_dst_tmp1, dc_val_t);
1069 pu1_dst_tmp1 += dst_strd;
1070 vst1_u8(pu1_dst_tmp1, dc_val_t);
1071 pu1_dst_tmp1 += dst_strd;
1072 vst1_u8(pu1_dst_tmp1, dc_val_t);
1073 pu1_dst_tmp1 += dst_strd;
1074 vst1_u8(pu1_dst_tmp1, dc_val_t);
1075 pu1_dst_tmp1 += dst_strd;
1076 vst1_u8(pu1_dst_tmp1, dc_val_t);
1077 pu1_dst_tmp1 += dst_strd;
1078
1079 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1080 if(row != 8)
1081 vst1_u8(pu1_dst_tmp1, dc_val_t);
1082 pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
1083 }
1084 }
1085 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
1086 }
1087 }
1088 /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
1089 else
1090 {
1091 WORD32 acc_dc;
1092 two_nt = 2 * nt;
1093
1094 acc_dc = 0;
1095 pu1_ref_tmp = pu1_ref + nt + 1;
1096 for(i = nt; i < two_nt; i++)
1097 {
1098 acc_dc += pu1_ref[i];
1099 acc_dc += pu1_ref_tmp[i];
1100 }
1101 dc_val = (acc_dc + nt) >> (log2nt_plus1);
1102 two_dc_val = 2 * dc_val;
1103 three_dc_val = 3 * dc_val;
1104 three_dc_val = three_dc_val + 2;
1105 dc_val_t = vdup_n_u8(dc_val);
1106
1107 if(nt == 32)
1108 {
1109 pu1_dst_tmp = pu1_dst;
1110 for(row = 0; row < nt; row++)
1111 {
1112 for(col = nt; col > 0; col -= 4)
1113 {
1114 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
1115 pu1_dst_tmp += 4;
1116 }
1117 pu1_dst_tmp += dst_strd - nt;
1118 }
1119 }
1120 else
1121
1122 {
1123 for(col = 1; col < nt; col++)
1124 {
1125 pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
1126 }
1127
1128 pu1_dst_tmp = pu1_dst + dst_strd + 0;
1129 /* Since first row is already updated before, loop count is nt-1 */
1130 for(row = nt - 1; row > 0; row -= 1)
1131 {
1132 for(col = nt; col > 0; col -= 4)
1133 {
1134 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
1135 pu1_dst_tmp += 4;
1136 }
1137 pu1_dst_tmp += dst_strd - nt;
1138 }
1139
1140 for(row = 1; row < nt; row++)
1141 {
1142 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
1143 }
1144 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
1145 }
1146 }
1147 }
1148 /* INTRA_PRED_LUMA_DC */
1149
1150 /**
1151 *******************************************************************************
1152 *
1153 * @brief
1154 * Intra prediction interpolation filter for horizontal luma variable.
1155 *
1156 * @par Description:
1157 * Horizontal intraprediction with reference neighboring samples location
1158 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
1159 *
1160 * @param[in] pu1_src
1161 * UWORD8 pointer to the source
1162 *
1163 * @param[out] pu1_dst
1164 * UWORD8 pointer to the destination
1165 *
1166 * @param[in] src_strd
1167 * integer source stride
1168 *
1169 * @param[in] dst_strd
1170 * integer destination stride
1171 *
1172 * @param[in] nt
1173 * integer Transform Block size
1174 *
1175 * @param[in] wd
1176 * integer width of the array
1177 *
1178 * @returns
1179 *
1180 * @remarks
1181 * None
1182 *
1183 *******************************************************************************
1184 */
1185
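/* Reference sketch (added): scalar form of the horizontal prediction computed below.
 * Every row repeats its left neighbour; for nt other than 32 the first row is
 * additionally gradient-filtered against the top references (CLIP_U8 here just
 * denotes clipping to the 0..255 range, done by the saturating NEON ops):
 *
 *     pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 1 - row];
 *
 *     pu1_dst[col] = CLIP_U8(pu1_ref[2 * nt - 1]
 *                          + ((pu1_ref[2 * nt + 1 + col] - pu1_ref[2 * nt]) >> 1));   // first row only
 */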
1186 void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
1187 WORD32 src_strd,
1188 UWORD8 *pu1_dst,
1189 WORD32 dst_strd,
1190 WORD32 nt,
1191 WORD32 mode)
1192 {
1193
1194 WORD32 row, col;
1195 WORD32 two_nt;
1196 UNUSED(src_strd);
1197 UNUSED(mode);
1198
1199 two_nt = 2 * nt;
1200
1201
1202 UWORD8 *pu1_dst_tmp = pu1_dst;
1203 UWORD32 pu1_val;
1204 uint8x8_t pu1_val_two_nt_1_row;
1205 if(nt == 32)
1206 {
1207 pu1_dst_tmp = pu1_dst;
1208 for(row = 0; row < nt; row++)
1209 {
1210 pu1_val = pu1_ref[two_nt - 1 - row];
1211 pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
1212 for(col = nt; col > 0; col -= 8)
1213 {
1214 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
1215 pu1_dst_tmp += 8;
1216 }
1217 pu1_dst_tmp += dst_strd - nt;
1218 }
1219 }
1220 else
1221
1222
1223 /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables */
1224 /* Variables are named according to the operation (instruction) they perform */
1225 /* (e.g. shift_val contains the shifted value, */
1226 /* add_sat has the added and saturated value) */
1227 /* Loops are unrolled by 4 and 8 considering the fact that the input width is either a multiple of 4 or 8 */
1228 /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
1229 {
1230 if(0 != (nt & 7)) /* cond for multiple of 4 */
1231 {
1232 UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
1233 UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
1234 UWORD8 *pu1_dst_4 = pu1_dst;
1235 UWORD8 *pu1_dst_4_tmp = pu1_dst;
1236
1237 uint32x2_t pu1_ref_val1, pu1_ref_val2;
1238 uint8x8_t dup_sub, round_val, dup_val;
1239 uint16x8_t dup_add, sub_val;
1240 int16x8_t shift_val, add_sat;
1241
1242 pu1_ref_val1 = vdup_n_u32(0);
1243 pu1_ref_val2 = vdup_n_u32(0);
1244
1245 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1246
1247 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
1248
1249 pu1_ref_4_two_nt_plus1 += (two_nt + 1);
1250
1251 pu1_ref_4_two_nt_minus_nt += (two_nt - nt);
1252
1253 for(row = nt; row > 0; row -= 4)
1254 {
1255 for(col = nt; col > 0; col -= 4)
1256 {
1257 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
1258 sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
1259 shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1260
1261 add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
1262 round_val = vqmovun_s16(add_sat);
1263 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
1264 pu1_dst_4 += dst_strd;
1265
1266 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
1267 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
1268 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1269 pu1_dst_4 += dst_strd;
1270
1271 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
1272 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1273 pu1_dst_4 += dst_strd;
1274
1275 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
1276 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1277 pu1_dst_4 += dst_strd;
1278
1279
1280 }
1281 /* worst cases */
1282 pu1_ref_4_two_nt_minus_nt += 3;
1283 pu1_ref_4_two_nt_plus1 += 4;
1284 pu1_dst_4 = (pu1_dst_4_tmp + 4);
1285 }
1286
1287 }
1288
1289 /* dup_1 - dup_8 are variables that hold values duplicated from the loaded source */
1290 /* Variables are named according to the operation (instruction) they perform */
1291 /* Loops are unrolled by 4 and 8 considering the fact that the input width is either a multiple of 4 or 8 */
1292 /* Rows and columns are unrolled by 8 when the width is a multiple of 8 */
1293
1294 else
1295 {
1296 UWORD8 *pu1_ref_tmp_1 = pu1_ref;
1297 UWORD8 *pu1_ref_tmp_2 = pu1_ref;
1298
1299 UWORD8 *pu1_dst_tmp_1 = pu1_dst;
1300 UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
1301 UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;
1302
1303 uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
1304 uint16x8_t sub_res, dup_add;
1305 int16x8_t shift_res, add_res;
1306
1307 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1308 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
1309
1310 pu1_ref_tmp_1 += (two_nt + 1);
1311 pu1_ref_tmp_2 += (two_nt - 1);
1312
1313 for(col = nt; col > 0; col -= 8)
1314 {
1315 src_tmp = vld1_u8(pu1_ref_tmp_1);
1316 pu1_ref_tmp_1 += 8;
1317
1318 sub_res = vsubl_u8(src_tmp, dup_sub);
1319 shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
1320 add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
1321 round_val = vqmovun_s16(add_res);
1322 vst1_u8(pu1_dst_tmp_1, round_val);
1323 pu1_dst_tmp_1 += 8;
1324 }
1325
1326 for(row = nt; row > 0; row -= 8)
1327 {
1328 pu1_ref_tmp_2 -= 8;
1329
1330 src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
1331 rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
1332
1333 dup_1 = vdup_lane_u8(rev_res, 0);
1334 dup_2 = vdup_lane_u8(rev_res, 1);
1335 dup_3 = vdup_lane_u8(rev_res, 2);
1336 dup_4 = vdup_lane_u8(rev_res, 3);
1337 dup_5 = vdup_lane_u8(rev_res, 4);
1338 dup_6 = vdup_lane_u8(rev_res, 5);
1339 dup_7 = vdup_lane_u8(rev_res, 6);
1340 dup_8 = vdup_lane_u8(rev_res, 7);
1341
1342 for(col = nt; col > 0; col -= 8)
1343 {
1344 pu1_dst_tmp_2 = pu1_dst_tmp_3;
1345
1346 vst1_u8(pu1_dst_tmp_2, dup_1);
1347 pu1_dst_tmp_2 += dst_strd;
1348
1349 vst1_u8(pu1_dst_tmp_2, dup_2);
1350 pu1_dst_tmp_2 += dst_strd;
1351
1352 vst1_u8(pu1_dst_tmp_2, dup_3);
1353 pu1_dst_tmp_2 += dst_strd;
1354
1355 vst1_u8(pu1_dst_tmp_2, dup_4);
1356 pu1_dst_tmp_2 += dst_strd;
1357
1358 vst1_u8(pu1_dst_tmp_2, dup_5);
1359 pu1_dst_tmp_2 += dst_strd;
1360
1361 vst1_u8(pu1_dst_tmp_2, dup_6);
1362 pu1_dst_tmp_2 += dst_strd;
1363
1364 vst1_u8(pu1_dst_tmp_2, dup_7);
1365 pu1_dst_tmp_2 += dst_strd;
1366
1367 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1368 if(row != 8)
1369 vst1_u8(pu1_dst_tmp_2, dup_8);
1370 pu1_dst_tmp_2 += dst_strd;
1371
1372 pu1_dst_tmp_3 += 8;
1373 }
1374 pu1_dst_tmp_2 -= (nt - 8);
1375 pu1_dst_tmp_3 = pu1_dst_tmp_2;
1376 }
1377 }
1378 }
1379 }
1380 /* INTRA_PRED_LUMA_HORZ */
1381
1382 /**
1383 *******************************************************************************
1384 *
1385 * @brief
1386 * Intra prediction interpolation filter for vertical luma variable.
1387 *
1388 * @par Description:
1389 * Vertical intraprediction with reference neighboring samples location
1390 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
1391 *
1392 * @param[in] pu1_src
1393 * UWORD8 pointer to the source
1394 *
1395 * @param[out] pu1_dst
1396 * UWORD8 pointer to the destination
1397 *
1398 * @param[in] src_strd
1399 * integer source stride
1400 *
1401 * @param[in] dst_strd
1402 * integer destination stride
1403 *
1404 * @param[in] nt
1405 * integer Transform Block size
1406 *
1407 * @param[in] wd
1408 * integer width of the array
1409 *
1410 * @returns
1411 *
1412 * @remarks
1413 * None
1414 *
1415 *******************************************************************************
1416 */
1417
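/* Reference sketch (added): scalar form of the vertical prediction computed below.
 * Every column repeats its top neighbour; for nt other than 32 the first column is
 * additionally gradient-filtered against the left references (CLIP_U8 here just
 * denotes clipping to the 0..255 range, done by the saturating NEON ops):
 *
 *     pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt + 1 + col];
 *
 *     pu1_dst[row * dst_strd] = CLIP_U8(pu1_ref[2 * nt + 1]
 *                          + ((pu1_ref[2 * nt - 1 - row] - pu1_ref[2 * nt]) >> 1));   // first column only
 */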
1418 void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
1419 WORD32 src_strd,
1420 UWORD8 *pu1_dst,
1421 WORD32 dst_strd,
1422 WORD32 nt,
1423 WORD32 mode)
1424 {
1425 WORD32 row, col;
1426 WORD32 two_nt;
1427 UNUSED(src_strd);
1428 UNUSED(mode);
1429
1430 two_nt = 2 * nt;
1431
1432 UWORD8 *pu1_dst_tmp = pu1_dst;
1433 UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
1434 uint8x8_t pu1_val_two_nt_1_col;
1435 if(nt == 32)
1436 {
1437 pu1_dst_tmp = pu1_dst;
1438 for(row = 0; row < nt; row++)
1439 {
1440 for(col = nt; col > 0; col -= 8)
1441 {
1442 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
1443 pu1_ref_tmp_1 += 8;
1444 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
1445 pu1_dst_tmp += 8;
1446 }
1447 pu1_ref_tmp_1 -= nt;
1448 pu1_dst_tmp += dst_strd - nt;
1449 }
1450 }
1451 else
1452
1453 {
1454 /* Variables are named according to the operation (instruction) they perform */
1455 /* (e.g. shift_val contains the shifted value, */
1456 /* add_sat has the added and saturated value) */
1457 /* Loops are unrolled by 4 and 8 considering the fact that the input width is either a multiple of 4 or 8 */
1458 /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
1459
1460 if(0 != (nt & 7))
1461 {
1462 WORD32 cond_4 = 0;
1463 UWORD8 *pu1_ref_val1 = pu1_ref;
1464 UWORD8 *pu1_ref_val2 = pu1_ref;
1465 UWORD8 *pu1_ref_val3 = pu1_ref;
1466
1467 UWORD8 *pu1_dst_val1 = pu1_dst;
1468 UWORD8 *pu1_dst_val2 = pu1_dst;
1469 UWORD8 *pu1_dst_val3 = pu1_dst;
1470
1471 uint8x8_t dup_2_sub, round_val, vext_val;
1472 uint16x8_t dup_2_add;
1473 uint32x2_t src_val1, src_val2, src_val3;
1474 uint16x8_t sub_val;
1475 int16x8_t shift_val1, add_sat;
1476 uint64x1_t shift_val2;
1477
1478 src_val1 = vdup_n_u32(0);
1479 src_val2 = vdup_n_u32(0);
1480 src_val3 = vdup_n_u32(0);
1481 pu1_ref_val1 += (two_nt - nt);
1482 pu1_ref_val3 += (two_nt + 2);
1483 pu1_ref_val2 += (two_nt + 1);
1484
1485 dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
1486 dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1487
1488 /* loops to store the first nt sets of values in the destination */
1489
1490 for(row = nt; row > 0; row -= 4)
1491 {
1492 for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
1493 {
1494 /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/
1495 src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
1496 sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
1497 shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1498 add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
1499 round_val = vqmovun_s16(add_sat);
1500
1501 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1502 src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
1503 vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
1504 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1505 pu1_dst_val1 += dst_strd;
1506
1507 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1508
1509 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1510 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1511 pu1_dst_val1 += dst_strd;
1512
1513 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1514
1515 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1516 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1517 pu1_dst_val1 += dst_strd;
1518
1519 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1520
1521 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1522 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1523 pu1_dst_val1 += dst_strd;
1524
1525 pu1_ref_val1 -= 4;
1526 }
1527
1528 /* loop to store next sets of eight values in the destination */
1529
1530 for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
1531 {
1532 src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
1533
1534 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1535 pu1_dst_val2 += dst_strd;
1536
1537 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1538 pu1_dst_val2 += dst_strd;
1539
1540 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1541 pu1_dst_val2 += dst_strd;
1542
1543 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1544 pu1_dst_val2 += dst_strd;
1545 }
1546 pu1_ref_val2 += 4;
1547 pu1_dst_val3 += 4;
1548 pu1_dst_val2 = pu1_dst_val3;
1549 cond_4 = 1;
1550 }
1551 }
1552
1553 /* rows and columns are unrolled by 8, when the width is multiple of 8 */
1554 else
1555 {
1556 WORD32 cond = 0, col_1;
1557 UWORD8 *pu1_dst_tmp_1 = pu1_dst;
1558 UWORD8 *pu1_dst_tmp_2 = pu1_dst;
1559 UWORD8 *pu1_dst_tmp_3 = pu1_dst;
1560
1561 UWORD8 *pu1_ref_tmp_1 = pu1_ref;
1562 UWORD8 *pu1_ref_tmp_2 = pu1_ref;
1563 UWORD8 *pu1_ref_tmp_3 = pu1_ref;
1564
1565 uint8x8_t pu1_src_tmp1;
1566 uint8x8_t pu1_src_tmp2;
1567
1568 uint8x8_t dup_sub;
1569 uint16x8_t dup_add;
1570 int16x8_t subsh_val;
1571 int16x8_t addsat_val;
1572 uint16x8_t sub_val;
1573 uint8x8_t round_val;
1574 uint8x8_t vext_t;
1575 uint64x1_t shift_64;
1576
1577 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1578 dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1579
1580 pu1_ref_tmp_1 += (two_nt);
1581 pu1_ref_tmp_1 -= 8;
1582 pu1_ref_tmp_2 += (two_nt + 2);
1583 pu1_ref_tmp_3 += (two_nt + 1);
1584
1585 /* loops to store the first nt sets of values in the destination */
1586
1587 for(row = nt; row > 0; row -= 8)
1588 {
1589 for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
1590 {
1591 pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
1592
1593 sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
1594 subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1595 addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
1596 round_val = vqmovun_s16(addsat_val);
1597
1598 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1599
1600 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
1601 vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
1602 vst1_u8(pu1_dst_tmp_1, vext_t);
1603 pu1_dst_tmp_1 += dst_strd;
1604
1605 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1606
1607 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1608 vst1_u8(pu1_dst_tmp_1, vext_t);
1609 pu1_dst_tmp_1 += dst_strd;
1610
1611 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1612 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1613 vst1_u8(pu1_dst_tmp_1, vext_t);
1614 pu1_dst_tmp_1 += dst_strd;
1615
1616 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1617 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1618 vst1_u8(pu1_dst_tmp_1, vext_t);
1619 pu1_dst_tmp_1 += dst_strd;
1620
1621 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
1622 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1623 vst1_u8(pu1_dst_tmp_1, vext_t);
1624 pu1_dst_tmp_1 += dst_strd;
1625
1626 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
1627 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1628 vst1_u8(pu1_dst_tmp_1, vext_t);
1629 pu1_dst_tmp_1 += dst_strd;
1630
1631 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
1632 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1633 vst1_u8(pu1_dst_tmp_1, vext_t);
1634 pu1_dst_tmp_1 += dst_strd;
1635
1636 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
1637 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1638 vst1_u8(pu1_dst_tmp_1, vext_t);
1639 pu1_dst_tmp_1 += dst_strd;
1640
1641 pu1_ref_tmp_1 -= 8;
1642 }
1643
1644 /* loop to store next sets of eight values in the destination */
1645
1646 for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
1647 {
1648 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
1649
1650 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1651 pu1_dst_tmp_2 += dst_strd;
1652
1653 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1654 pu1_dst_tmp_2 += dst_strd;
1655
1656 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1657 pu1_dst_tmp_2 += dst_strd;
1658
1659 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1660 pu1_dst_tmp_2 += dst_strd;
1661
1662 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1663 pu1_dst_tmp_2 += dst_strd;
1664
1665 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1666 pu1_dst_tmp_2 += dst_strd;
1667
1668 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1669 pu1_dst_tmp_2 += dst_strd;
1670
1671 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1672 pu1_dst_tmp_2 += dst_strd;
1673 }
1674 pu1_ref_tmp_3 += 8;
1675 pu1_dst_tmp_3 += 8;
1676 pu1_dst_tmp_2 = pu1_dst_tmp_3;
1677 cond = 1;
1678 }
1679 }
1680 }
1681 }
1682 /* INTRA_PRED_LUMA_VER */
1683
1684 /**
1685 *******************************************************************************
1686 *
1687 * @brief
1688 * Intra prediction interpolation filter for luma mode2.
1689 *
1690 * @par Description:
1691 * Intraprediction for mode 2 (sw angle) with reference neighboring samples
1692 * location pointed by 'pu1_ref' to the TU block location pointed by
1693 * 'pu1_dst'
1694 *
1695 * @param[in] pu1_src
1696 * UWORD8 pointer to the source
1697 *
1698 * @param[out] pu1_dst
1699 * UWORD8 pointer to the destination
1700 *
1701 * @param[in] src_strd
1702 * integer source stride
1703 *
1704 * @param[in] dst_strd
1705 * integer destination stride
1706 *
1707 * @param[in] nt
1708 * integer Transform Block size
1709 *
1710 * @param[in] wd
1711 * integer width of the array
1712 *
1713 * @returns
1714 *
1715 * @remarks
1716 * None
1717 *
1718 *******************************************************************************
1719 */
1720
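/* Reference sketch (added): mode 2 is the fully diagonal prediction from the left
 * reference array (intraPredAngle of +32), which with this reference layout reduces
 * to a single copy per sample:
 *
 *     pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col];
 *
 * The NEON code below realises this with reversed 8-byte loads so that consecutive
 * rows become simple byte shifts of the same vector.
 */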
1721 void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
1722 WORD32 src_strd,
1723 UWORD8 *pu1_dst,
1724 WORD32 dst_strd,
1725 WORD32 nt,
1726 WORD32 mode)
1727 {
1728
1729 WORD32 row, col;
1730 WORD32 two_nt;
1731 UNUSED(src_strd);
1732 UNUSED(mode);
1733
1734 /* rev_res is named so because it holds the reversed result value */
1735 /* Loops are unrolled by 4 and 8 considering the fact that the input width is either a multiple of 4 or 8 */
1736 /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
1737
1738 if(0 != (nt & 7))
1739 {
1740 UWORD8 *pu1_ref_tmp = pu1_ref;
1741 UWORD8 *pu1_dst_tmp = pu1_dst;
1742 uint8x8_t pu1_src_val, rev_res;
1743 uint64x1_t shift_res;
1744
1745 for(col = nt; col > 0; col -= 4)
1746 {
1747 for(row = nt; row > 0; row -= 4)
1748 {
1749 /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */
1750
1751 pu1_src_val = vld1_u8(pu1_ref_tmp);
1752 shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
1753 rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
1754
1755 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
1756 pu1_dst_tmp += dst_strd;
1757
1758 shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
1759 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1760 pu1_dst_tmp += dst_strd;
1761
1762 shift_res = vshr_n_u64(shift_res, 8);
1763 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1764 pu1_dst_tmp += dst_strd;
1765
1766 shift_res = vshr_n_u64(shift_res, 8);
1767 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1768 pu1_dst_tmp += dst_strd;
1769 }
1770 }
1771 }
1772
1773 /* rev_val_second, rev_val_first reverse the loaded values in order to get them in the right order */
1774 /* shift_64 shifts the reversed 2nd values to get the value we need */
1775 /* Rows and columns are unrolled by 8 when the width is a multiple of 8 */
1776
1777 else
1778 {
1779 UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
1780 UWORD8 *pu1_dst_tmp = pu1_dst;
1781 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1782
1783 uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
1784 uint64x1_t shift_val;
1785
1786 two_nt = 2 * nt;
1787 pu1_ref_two_nt_minus2 += (two_nt);
1788 pu1_ref_two_nt_minus2 -= 8;
1789
1790 for(col = nt; col > 0; col -= 8)
1791 {
1792 for(row = nt; row > 0; row -= 8)
1793 {
1794 pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
1795 rev_val_first = vrev64_u8(pu1_src_val2);
1796
1797 pu1_ref_two_nt_minus2 -= 8;
1798 pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
1799 rev_val_second = vrev64_u8(pu1_src_val1);
1800
1801 vext_t = vext_u8(rev_val_first, rev_val_second, 1);
1802 vst1_u8(pu1_dst_tmp, vext_t);
1803 pu1_dst_tmp += dst_strd;
1804
1805 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
1806 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1807 vst1_u8(pu1_dst_tmp, vext_t);
1808 pu1_dst_tmp += dst_strd;
1809
1810 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
1811 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1812 vst1_u8(pu1_dst_tmp, vext_t);
1813 pu1_dst_tmp += dst_strd;
1814
1815 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
1816 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1817 vst1_u8(pu1_dst_tmp, vext_t);
1818 pu1_dst_tmp += dst_strd;
1819
1820 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
1821 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1822 vst1_u8(pu1_dst_tmp, vext_t);
1823 pu1_dst_tmp += dst_strd;
1824
1825 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
1826 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1827 vst1_u8(pu1_dst_tmp, vext_t);
1828 pu1_dst_tmp += dst_strd;
1829
1830 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
1831 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1832 vst1_u8(pu1_dst_tmp, vext_t);
1833 pu1_dst_tmp += dst_strd;
1834
1835 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
1836 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1837 vst1_u8(pu1_dst_tmp, vext_t);
1838 pu1_dst_tmp += dst_strd;
1839 }
1840 pu1_dst_tmp_plus8 += 8;
1841 pu1_dst_tmp = pu1_dst_tmp_plus8;
1842 pu1_ref_two_nt_minus2 += (nt - 8);
1843 }
1844 }
1845 }
1846 /* INTRA_PRED_LUMA_MODE2 */
1847
1848 /**
1849 *******************************************************************************
1850 *
1851 * @brief
1852 * Intra prediction interpolation filter for luma mode 18 & mode 34.
1853 *
1854 * @par Description:
1855 * Intra prediction for mode 18 and mode 34 (the pure 45-degree diagonals),
1856 * using the reference neighboring samples pointed to by 'pu1_ref' to predict
1857 * the TU block pointed to by 'pu1_dst'
1858 *
1859 * @param[in] pu1_ref
1860 * UWORD8 pointer to the reference samples
1861 *
1862 * @param[out] pu1_dst
1863 * UWORD8 pointer to the destination
1864 *
1865 * @param[in] src_strd
1866 * integer source stride
1867 *
1868 * @param[in] dst_strd
1869 * integer destination stride
1870 *
1871 * @param[in] nt
1872 * integer Transform Block size
1873 *
1874 * @param[in] mode
1875 * integer intraprediction mode
1876 *
1877 * @returns
1878 *
1879 * @remarks
1880 * None
1881 *
1882 *******************************************************************************
1883 */
1884
1885 void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
1886 WORD32 src_strd,
1887 UWORD8 *pu1_dst,
1888 WORD32 dst_strd,
1889 WORD32 nt,
1890 WORD32 mode)
1891 {
1892
1893 WORD32 row, col, idx;
1894 WORD32 intraPredAngle = 32;
1895 WORD32 two_nt;
1896 UNUSED(src_strd);
1897 two_nt = 2 * nt;
1898
1899 UWORD8 *pu1_ref_tmp = pu1_ref;
1900 UWORD8 *pu1_ref_tmp1 = pu1_ref;
1901 UWORD8 *pu1_dst_tmp = pu1_dst;
1902 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1903
1904 uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
1905
1906    /* src_tmp_1st and src_tmp_2nd hold the first eight and the next eight samples loaded from the source (pu1_ref) */
1907    /* vext1 - vext7 hold the vext results between the two loads, split across registers to help dual issue */
1908    /* loops are unrolled by 4 or 8, since the input width is always a multiple of 4 or 8 */
1909    /* rows and columns are unrolled by 8 when the width is a multiple of 8 */
1910    /* separate loops are maintained for mode 18 and mode 34 */
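    /* A scalar sketch (editorial assumption, derived from the UNROLLED relation */
    /* quoted inside the loops below and from intraPredAngle = +32 / -32 for */
    /* modes 34 / 18): */
    /* */
    /*     idx = ((row + 1) * intraPredAngle) >> 5;    // row + 1 or -(row + 1) */
    /*     pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1]; */
    /* */
    /* so every output row is the previous one shifted by a single reference */
    /* sample, which is what the vext_u8() chains implement without reloading. */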
1911
1912    /* path taken when nt is a multiple of 8 */
1913 if(0 == (nt & 7))
1914 {
1915 if(mode == 34)
1916 {
1917 pu1_ref_tmp += (two_nt + 2);
1918
1919 for(row = nt; row > 0; row -= 8)
1920 {
1921 for(col = nt; col > 0; col -= 8)
1922 {
1923 /* Loading 1st eight values */
1924 src_tmp_1st = vld1_u8(pu1_ref_tmp);
1925 pu1_ref_tmp += 8;
1926
1927 /* Loading next eight values */
1928 src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1929
1930 /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1931 vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
1932 vst1_u8(pu1_dst_tmp, src_tmp_1st);
1933 pu1_dst_tmp += dst_strd;
1934
1935 vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
1936 vst1_u8(pu1_dst_tmp, vext1);
1937 pu1_dst_tmp += dst_strd;
1938
1939 vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
1940 vst1_u8(pu1_dst_tmp, vext2);
1941 pu1_dst_tmp += dst_strd;
1942
1943 vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
1944 vst1_u8(pu1_dst_tmp, vext3);
1945 pu1_dst_tmp += dst_strd;
1946
1947 vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
1948 vst1_u8(pu1_dst_tmp, vext4);
1949 pu1_dst_tmp += dst_strd;
1950
1951 vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
1952 vst1_u8(pu1_dst_tmp, vext5);
1953 pu1_dst_tmp += dst_strd;
1954
1955 vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
1956 vst1_u8(pu1_dst_tmp, vext6);
1957 pu1_dst_tmp += dst_strd;
1958
1959 vst1_u8(pu1_dst_tmp, vext7);
1960 pu1_dst_tmp += dst_strd;
1961 }
1962
1963 pu1_dst_tmp_plus8 += 8;
1964 pu1_dst_tmp = pu1_dst_tmp_plus8;
1965 pu1_ref_tmp -= (nt - 8);
1966 }
1967 }
1968 else /* Loop for mode 18 */
1969 {
1970 pu1_ref_tmp += (two_nt);
1971
1972 for(row = nt; row > 0; row -= 8)
1973 {
1974 for(col = nt; col > 0; col -= 8)
1975 {
1976 /* Loading 1st eight values */
1977 src_tmp_1st = vld1_u8(pu1_ref_tmp);
1978 pu1_ref_tmp -= 8;
1979
1980 /* Loading next eight values */
1981 src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1982
1983 /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1984 vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
1985 vst1_u8(pu1_dst_tmp, src_tmp_1st);
1986 pu1_dst_tmp += dst_strd;
1987
1988 vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
1989 vst1_u8(pu1_dst_tmp, vext1);
1990 pu1_dst_tmp += dst_strd;
1991
1992 vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
1993 vst1_u8(pu1_dst_tmp, vext2);
1994 pu1_dst_tmp += dst_strd;
1995
1996 vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
1997 vst1_u8(pu1_dst_tmp, vext3);
1998 pu1_dst_tmp += dst_strd;
1999
2000 vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
2001 vst1_u8(pu1_dst_tmp, vext4);
2002 pu1_dst_tmp += dst_strd;
2003
2004 vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
2005 vst1_u8(pu1_dst_tmp, vext5);
2006 pu1_dst_tmp += dst_strd;
2007
2008 vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
2009 vst1_u8(pu1_dst_tmp, vext6);
2010 pu1_dst_tmp += dst_strd;
2011
2012 vst1_u8(pu1_dst_tmp, vext7);
2013 pu1_dst_tmp += dst_strd;
2014 }
2015 pu1_dst_tmp_plus8 += 8;
2016 pu1_dst_tmp = pu1_dst_tmp_plus8;
2017 pu1_ref_tmp += (nt + 8);
2018 }
2019 }
2020 }
2021
2022    /* rows and columns are unrolled by 4 when the width is a multiple of 4 */
2023
2024 else /* loop for multiples of 4 */
2025 {
2026 uint8x8_t src_val1;
2027 uint8x8_t src_val2;
2028
2029 if(mode == 18)
2030 intraPredAngle = -32;
2031 else if(mode == 34)
2032 intraPredAngle = 32;
2033
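        /* Example (editorial): for mode 18 intraPredAngle is -32, so idx = -(row + 1) */
        /* and each row starts one reference sample earlier (pu1_ref + two_nt - row); */
        /* for mode 34 it is +32, so idx = row + 1 and each row starts one sample */
        /* later. The two loads below fetch the rows 'row' and 'row + 1' in one go. */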
2034 for(row = 0; row < nt; row += 2)
2035 {
2036 /* unrolling 2 rows */
2037 idx = ((row + 1) * intraPredAngle) >> 5;
2038 pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
2039 src_val1 = vld1_u8(pu1_ref_tmp);
2040
2041 idx = ((row + 2) * intraPredAngle) >> 5;
2042 pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
2043 src_val2 = vld1_u8(pu1_ref_tmp1);
2044
2045 /* unrolling 4 col */
2046 for(col = nt; col > 0; col -= 4)
2047 {
2048 pu1_dst_tmp = pu1_dst;
2049 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
2050 pu1_dst_tmp += dst_strd;
2051 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
2052 pu1_dst += 4;
2053 }
2054 pu1_dst += 2 * dst_strd - nt;
2055 }
2056 }
2057 }
2058 /* INTRA_PRED_LUMA_MODE_18_34 */
2059
2060 /**
2061 *******************************************************************************
2062 *
2063 * @brief
2064 * Intra prediction interpolation filter for luma mode 3 to mode 9
2065 *
2066 * @par Description:
2067 * Intra prediction for modes 3 to 9 (positive angle, horizontal modes), using
2068 * the reference neighboring samples pointed to by 'pu1_ref' to predict the TU
2069 * block pointed to by 'pu1_dst'
2070 *
2071 * @param[in] pu1_ref
2072 * UWORD8 pointer to the reference samples
2073 *
2074 * @param[out] pu1_dst
2075 * UWORD8 pointer to the destination
2076 *
2077 * @param[in] src_strd
2078 * integer source stride
2079 *
2080 * @param[in] dst_strd
2081 * integer destination stride
2082 *
2083 * @param[in] nt
2084 * integer Transform Block size
2085 *
2086 * @param[in] mode
2087 * integer intraprediction mode
2088 *
2089 * @returns
2090 *
2091 * @remarks
2092 * None
2093 *
2094 *******************************************************************************
2095 */
2096
2097
2098 void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
2099 WORD32 src_strd,
2100 UWORD8 *pu1_dst,
2101 WORD32 dst_strd,
2102 WORD32 nt,
2103 WORD32 mode)
2104 {
2105
2106 WORD32 row, col;
2107 WORD32 intra_pred_ang;
2108 WORD32 pos, fract = 100, fract_prev;
2109 UNUSED(src_strd);
2110 if(0 == (nt & 7))
2111 {
2112
2113 UWORD8 *pu1_ref_main_idx = pu1_ref;
2114 UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2115
2116 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2117 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2118
2119 WORD32 two_nt = 2 * nt;
2120
2121 pu1_ref_main_idx += two_nt;
2122 pu1_ref_main_idx_1 += two_nt - 1;
2123
2124 uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2125 uint8x8_t shift_res;
2126 uint16x8_t mul_res1, mul_res2, add_res;
2127
2128 /* Intra Pred Angle according to the mode */
2129 intra_pred_ang = gai4_ihevc_ang_table[mode];
2130
2131 pu1_ref_main_idx -= 8;
2132 pu1_ref_main_idx_1 -= 8;
2133
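        /* Scalar sketch (editorial assumption) of the two-tap filter evaluated by the */
        /* vmull/vaddq/vrshrn sequence below for each destination sample of these */
        /* horizontal modes: */
        /* */
        /*     pos   = (col + 1) * intra_pred_ang; */
        /*     idx   = pos >> 5; */
        /*     fract = pos & 31; */
        /*     pu1_dst[col + (row * dst_strd)] = */
        /*         (UWORD8)(((32 - fract) * pu1_ref[two_nt - row - idx - 1] */
        /*                  + fract * pu1_ref[two_nt - row - idx - 2] + 16) >> 5); */
        /* */
        /* fract_prev tracks whether the integer step idx changed between columns; */
        /* the pointer arithmetic around the row loop applies the corresponding step. */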
2134 for(col = 0; col < nt; col++)
2135 {
2136 fract_prev = fract;
2137
2138 pos = ((col + 1) * intra_pred_ang);
2139 fract = pos & (31);
2140
2141 if(fract_prev < fract)
2142 {
2143 pu1_ref_main_idx += 1;
2144 pu1_ref_main_idx_1 += 1;
2145 }
2146
2147 dup_const_fract = vdup_n_u8((uint8_t)fract);
2148 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2149
2150 for(row = nt; row > 0; row -= 8)
2151 {
2152 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2153 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2154
2155 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2156 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2157
2158 add_res = vaddq_u16(mul_res1, mul_res2);
2159
2160 shift_res = vrshrn_n_u16(add_res, 5);
2161
2162 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2163 pu1_dst_tmp1 += dst_strd;
2164
2165 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2166 pu1_dst_tmp1 += dst_strd;
2167
2168 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2169 pu1_dst_tmp1 += dst_strd;
2170
2171 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2172 pu1_dst_tmp1 += dst_strd;
2173
2174 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2175 pu1_dst_tmp1 += dst_strd;
2176
2177 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2178 pu1_dst_tmp1 += dst_strd;
2179
2180 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2181 pu1_dst_tmp1 += dst_strd;
2182
2183 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2184 pu1_dst_tmp1 += dst_strd;
2185
2186 pu1_ref_main_idx -= 8;
2187 pu1_ref_main_idx_1 -= 8;
2188
2189 }
2190 pu1_dst_tmp2 += 1;
2191 pu1_dst_tmp1 = pu1_dst_tmp2;
2192
2193 pu1_ref_main_idx += nt;
2194 pu1_ref_main_idx_1 += nt;
2195
2196 pu1_ref_main_idx -= 1;
2197 pu1_ref_main_idx_1 -= 1;
2198
2199 }
2200 }
2201 else
2202 {
2203 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2204 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2205 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2206 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2207
2208 pu1_ref_tmp1 += nt;
2209 pu1_ref_tmp2 += (nt - 1);
2210
2211 uint8x8_t dup_fract, dup_32_fract, shift_res;
2212 uint16x8_t mul_res1, mul_res2, add_res;
2213 uint32x2_t pu1_ref_val1, pu1_ref_val2;
2214
2215 pu1_ref_val1 = vdup_n_u32(0);
2216 pu1_ref_val2 = vdup_n_u32(0);
2217
2218 /* Intra Pred Angle according to the mode */
2219 intra_pred_ang = gai4_ihevc_ang_table[mode];
2220
2221
2222 for(col = 0; col < nt; col++)
2223 {
2224 fract_prev = fract;
2225 pos = ((col + 1) * intra_pred_ang);
2226 fract = pos & (31);
2227 if(fract_prev < fract)
2228 {
2229 pu1_ref_tmp1 += 1;
2230 pu1_ref_tmp2 += 1;
2231 }
2232 dup_fract = vdup_n_u8((uint8_t)fract);
2233 dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2234
2235 for(row = nt; row > 0; row -= 4)
2236 {
2237 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2238 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2239
2240 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2241 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2242
2243 add_res = vaddq_u16(mul_res1, mul_res2);
2244
2245 shift_res = vrshrn_n_u16(add_res, 5);
2246
2247 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2248 pu1_dst_tmp1 += dst_strd;
2249
2250 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2251 pu1_dst_tmp1 += dst_strd;
2252
2253 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2254 pu1_dst_tmp1 += dst_strd;
2255
2256 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2257
2258 }
2259 pu1_ref_tmp1 -= 1;
2260 pu1_ref_tmp2 -= 1;
2261
2262 pu1_dst_tmp2 += 1;
2263 pu1_dst_tmp1 = pu1_dst_tmp2;
2264
2265 }
2266
2267
2268 }
2269
2270 }
2271
2272 /**
2273 *******************************************************************************
2274 *
2275 * @brief
2276 * Intra prediction interpolation filter for luma mode 11 to mode 17
2277 *
2278 * @par Description:
2279 * Intra prediction for modes 11 to 17 (negative angle, horizontal modes),
2280 * using the reference neighboring samples pointed to by 'pu1_ref' to predict
2281 * the TU block pointed to by 'pu1_dst'
2282 *
2283 * @param[in] pu1_ref
2284 * UWORD8 pointer to the reference samples
2285 *
2286 * @param[out] pu1_dst
2287 * UWORD8 pointer to the destination
2288 *
2289 * @param[in] src_strd
2290 * integer source stride
2291 *
2292 * @param[in] dst_strd
2293 * integer destination stride
2294 *
2295 * @param[in] nt
2296 * integer Transform Block size
2297 *
2298 * @param[in] mode
2299 * integer intraprediction mode
2300 *
2301 * @returns
2302 *
2303 * @remarks
2304 * None
2305 *
2306 *******************************************************************************
2307 */
2308
2309
2310 void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
2311 WORD32 src_strd,
2312 UWORD8 *pu1_dst,
2313 WORD32 dst_strd,
2314 WORD32 nt,
2315 WORD32 mode)
2316 {
2317
2318 WORD32 row, col, k;
2319 WORD32 two_nt;
2320 WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
2321 WORD32 pos, fract = 1000, fract_prev;
2322 WORD32 ref_idx;
2323
2324 UWORD8 *ref_main;
2325 UWORD8 *ref_main_tmp;
2326
2327 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2328 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2329 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2330 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2331
2332 UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
2333
2334 uint16x8_t mul_res1, mul_res2, add_res;
2335 uint8x8_t dup_const_fract, dup_const_32_fract;
2336 uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2337 uint8x8_t ref_left_t;
2338 uint32x2_t ref_left_tmp;
2339 UNUSED(src_strd);
2340 ref_left_tmp = vdup_n_u32(0);
2341
2342 inv_ang_sum = 128;
2343 two_nt = 2 * nt;
2344
2345 intra_pred_ang = gai4_ihevc_ang_table[mode];
2346
2347 inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
2348
2349 pu1_ref_tmp1 += two_nt;
2350
2351 ref_main = ref_temp + (nt - 1);
2352 ref_main_tmp = ref_main;
2353
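    /* Layout note (editorial): ref_main[x], for x = 0 .. nt, is filled below with */
    /* pu1_ref[two_nt - x], i.e. the left reference read from the top-left corner */
    /* downwards; ref_main points (nt - 1) bytes into ref_temp so that the */
    /* negative indices ref_main[-1] .. ref_main[ref_idx + 1], filled further */
    /* below, can hold the samples projected from the above reference. */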
2354 if(0 == (nt & 7))
2355 {
2356 pu1_ref_tmp2 += (two_nt - 7);
2357
2358 for(k = nt - 1; k >= 0; k -= 8)
2359 {
2360
2361 ref_left_t = vld1_u8(pu1_ref_tmp2);
2362
2363 ref_left_t = vrev64_u8(ref_left_t);
2364 vst1_u8(ref_main_tmp, ref_left_t);
2365 ref_main_tmp += 8;
2366 pu1_ref_tmp2 -= 8;
2367
2368 }
2369
2370 }
2371 else
2372 {
2373 uint8x8_t rev_val;
2374 pu1_ref_tmp2 += (two_nt - (nt - 1));
2375
2376 for(k = nt - 1; k >= 0; k -= 8)
2377 {
2378
2379 ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
2380
2381 rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
2382 vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
2383
2384 }
2385
2386 }
2387
2388 ref_main[nt] = pu1_ref[two_nt - nt];
2389
2390 /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
2391
2392 ref_idx = (nt * intra_pred_ang) >> 5;
2393
2394    /* SIMD optimization of this loop is possible using a look-up table */
2395    /* For negative angles, derive the main reference samples from the side */
2396    /* reference samples; refer to section 8.4.4.2.6 */
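    /* Worked example (editorial; values assume the usual HEVC relation */
    /* invAngle = round(256 * 32 / |intraPredAngle|)): for mode 17 the angle is */
    /* -26 and inv_ang is 315, so inv_ang_sum takes the values 443, 758, 1073, ... */
    /* and (inv_ang_sum >> 8) selects pu1_ref[two_nt + 1], [two_nt + 2], */
    /* [two_nt + 4], ..., i.e. the above reference is sub-sampled to extend */
    /* ref_main to the negative indices required by the projection. */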
2397 for(k = -1; k > ref_idx; k--)
2398 {
2399 inv_ang_sum += inv_ang;
2400 ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
2401 }
2402
2403 UWORD8 *ref_main_tmp1 = ref_main;
2404 UWORD8 *ref_main_tmp2 = ref_main;
2405
2406 ref_main_tmp2 += 1;
2407
2408 if(0 == (nt & 7))
2409 {
2410        /* For angles other than 45 degrees, interpolate between 2 neighboring */
2411        /* samples, weighted by distance, to obtain each destination sample */
2412 for(col = 0; col < nt; col++)
2413 {
2414
2415 fract_prev = fract;
2416 pos = ((col + 1) * intra_pred_ang);
2417 fract = pos & (31);
2418
2419 if(fract_prev < fract)
2420 {
2421 ref_main_tmp1 -= 1;
2422 ref_main_tmp2 -= 1;
2423 }
2424
2425 dup_const_fract = vdup_n_u8((uint8_t)fract);
2426 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2427
2428 // Do linear filtering
2429 for(row = nt; row > 0; row -= 8)
2430 {
2431 ref_main_idx = vld1_u8(ref_main_tmp1);
2432
2433 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2434
2435 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2436 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2437
2438 add_res = vaddq_u16(mul_res1, mul_res2);
2439
2440 shift_res = vrshrn_n_u16(add_res, 5);
2441
2442 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2443 pu1_dst_tmp1 += dst_strd;
2444
2445 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2446 pu1_dst_tmp1 += dst_strd;
2447
2448 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2449 pu1_dst_tmp1 += dst_strd;
2450
2451 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2452 pu1_dst_tmp1 += dst_strd;
2453
2454 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2455 pu1_dst_tmp1 += dst_strd;
2456
2457 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2458 pu1_dst_tmp1 += dst_strd;
2459
2460 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2461 pu1_dst_tmp1 += dst_strd;
2462
2463 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2464 pu1_dst_tmp1 += dst_strd;
2465
2466 ref_main_tmp1 += 8;
2467 ref_main_tmp2 += 8;
2468 }
2469
2470 ref_main_tmp1 -= nt;
2471 ref_main_tmp2 -= nt;
2472
2473 pu1_dst_tmp2 += 1;
2474 pu1_dst_tmp1 = pu1_dst_tmp2;
2475 }
2476 }
2477 else
2478 {
2479 uint32x2_t ref_main_idx1, ref_main_idx2;
2480
2481 ref_main_idx1 = vdup_n_u32(0);
2482 ref_main_idx2 = vdup_n_u32(0);
2483
2484 for(col = 0; col < nt; col++)
2485 {
2486 fract_prev = fract;
2487 pos = ((col + 1) * intra_pred_ang);
2488 fract = pos & (31);
2489
2490 if(fract_prev < fract)
2491 {
2492 ref_main_tmp1 -= 1;
2493 ref_main_tmp2 -= 1;
2494 }
2495
2496 dup_const_fract = vdup_n_u8((uint8_t)fract);
2497 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2498
2499 for(row = nt; row > 0; row -= 4)
2500 {
2501
2502 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2503 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2504
2505 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2506 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2507
2508 add_res = vaddq_u16(mul_res1, mul_res2);
2509
2510 shift_res = vrshrn_n_u16(add_res, 5);
2511
2512 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2513 pu1_dst_tmp1 += dst_strd;
2514
2515 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2516 pu1_dst_tmp1 += dst_strd;
2517
2518 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2519 pu1_dst_tmp1 += dst_strd;
2520
2521 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2522 pu1_dst_tmp1 += dst_strd;
2523
2524 }
2525
2526 pu1_dst_tmp2 += 1;
2527 pu1_dst_tmp1 = pu1_dst_tmp2;
2528
2529 }
2530
2531 }
2532 }
2533
2534 /**
2535 *******************************************************************************
2536 *
2537 * @brief
2538 * Intra prediction interpolation filter for luma mode 19 to mode 25
2539 *
2540 * @par Description:
2541 * Intra prediction for modes 19 to 25 (negative angle, vertical modes), using
2542 * the reference neighboring samples pointed to by 'pu1_ref' to predict the TU
2543 * block pointed to by 'pu1_dst'
2544 *
2545 * @param[in] pu1_ref
2546 * UWORD8 pointer to the reference samples
2547 *
2548 * @param[out] pu1_dst
2549 * UWORD8 pointer to the destination
2550 *
2551 * @param[in] src_strd
2552 * integer source stride
2553 *
2554 * @param[in] dst_strd
2555 * integer destination stride
2556 *
2557 * @param[in] nt
2558 * integer Transform Block size
2559 *
2560 * @param[in] mode
2561 * integer intraprediction mode
2562 *
2563 * @returns
2564 *
2565 * @remarks
2566 * None
2567 *
2568 *******************************************************************************
2569 */
2570
2571
2572 void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
2573 WORD32 src_strd,
2574 UWORD8 *pu1_dst,
2575 WORD32 dst_strd,
2576 WORD32 nt,
2577 WORD32 mode)
2578 {
2579
2580 WORD32 row, col, k;
2581 WORD32 two_nt, intra_pred_ang;
2582    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
2583 WORD32 ref_idx;
2584 UWORD8 *ref_main;
2585 UWORD8 *ref_main_tmp;
2586 UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
2587
2588 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2589 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2590 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2591
2592 uint16x8_t mul_res1, mul_res2, add_res;
2593 uint8x8_t dup_const_fract, dup_const_32_fract;
2594 uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2595 uint8x8_t ref_above_t;
2596 uint32x2_t ref_above_tmp;
2597 UNUSED(src_strd);
2598 ref_above_tmp = vdup_n_u32(0);
2599
2600 two_nt = 2 * nt;
2601 intra_pred_ang = gai4_ihevc_ang_table[mode];
2602 inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
2603
2604 /* Intermediate reference samples for negative angle modes */
2605    /* This has to be removed during optimization */
2606 pu1_ref_tmp1 += two_nt;
2607
2608
2609 ref_main = ref_temp + (nt - 1);
2610 ref_main_tmp = ref_main;
2611
2612 if(0 == (nt & 7))
2613 {
2614 pu1_ref_tmp2 += (two_nt - 7);
2615 for(k = nt - 1; k >= 0; k -= 8)
2616 {
2617
2618 ref_above_t = vld1_u8(pu1_ref_tmp1);
2619 vst1_u8(ref_main_tmp, ref_above_t);
2620 ref_main_tmp += 8;
2621 pu1_ref_tmp1 += 8;
2622
2623 }
2624
2625 }
2626 else
2627 {
2628 pu1_ref_tmp2 += (two_nt - (nt - 1));
2629
2630 for(k = nt - 1; k >= 0; k -= 4)
2631 {
2632
2633 ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
2634 vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
2635
2636 }
2637
2638 }
2639
2640 ref_main[nt] = pu1_ref[two_nt + nt];
2641
2642    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
2643
2644 ref_idx = (nt * intra_pred_ang) >> 5;
2645 inv_ang_sum = 128;
2646
2647    /* SIMD optimization of this loop is possible using a look-up table */
2648    /* For negative angles, derive the main reference samples from the side */
2649    /* reference samples; refer to section 8.4.4.2.6 */
2650 for(k = -1; k > ref_idx; k--)
2651 {
2652 inv_ang_sum += inv_ang;
2653 ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
2654 }
2655
2656 UWORD8 *ref_main_tmp1 = ref_main;
2657 UWORD8 *ref_main_tmp2 = ref_main;
2658
2659 ref_main_tmp2 += 1;
2660
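    /* Scalar sketch (editorial assumption) of the filtering below for one row of */
    /* these vertical modes: */
    /* */
    /*     pos   = (row + 1) * intra_pred_ang; */
    /*     idx   = pos >> 5;          // <= 0 for these negative angles */
    /*     fract = pos & 31; */
    /*     for(col = 0; col < nt; col++) */
    /*         pu1_dst[(row * dst_strd) + col] = */
    /*             (UWORD8)(((32 - fract) * ref_main[col + idx + 1] */
    /*                      + fract * ref_main[col + idx + 2] + 16) >> 5); */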
2661 if(0 == (nt & 7))
2662 {
2663        /* For angles other than 45 degrees, interpolate between 2 neighboring */
2664        /* samples, weighted by distance, to obtain each destination sample */
2665 for(row = 0; row < nt; row++)
2666 {
2667
2668 fract_prev = fract;
2669 pos = ((row + 1) * intra_pred_ang);
2670 fract = pos & (31);
2671
2672 if(fract_prev < fract)
2673 {
2674 ref_main_tmp1 -= 1;
2675 ref_main_tmp2 -= 1;
2676 }
2677
2678 dup_const_fract = vdup_n_u8((uint8_t)fract);
2679 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2680
2681 // Do linear filtering
2682 for(col = nt; col > 0; col -= 8)
2683 {
2684 ref_main_idx = vld1_u8(ref_main_tmp1);
2685
2686 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2687
2688 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2689 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2690
2691 add_res = vaddq_u16(mul_res1, mul_res2);
2692
2693 shift_res = vrshrn_n_u16(add_res, 5);
2694
2695 vst1_u8(pu1_dst_tmp1, shift_res);
2696 pu1_dst_tmp1 += 8;
2697
2698 ref_main_tmp1 += 8;
2699 ref_main_tmp2 += 8;
2700 }
2701
2702 ref_main_tmp1 -= nt;
2703 ref_main_tmp2 -= nt;
2704
2705 pu1_dst_tmp1 += (dst_strd - nt);
2706 }
2707 }
2708 else
2709 {
2710 uint32x2_t ref_main_idx1, ref_main_idx2;
2711
2712 ref_main_idx1 = vdup_n_u32(0);
2713 ref_main_idx2 = vdup_n_u32(0);
2714
2715 for(row = 0; row < nt; row++)
2716 {
2717 fract_prev = fract;
2718 pos = ((row + 1) * intra_pred_ang);
2719 fract = pos & (31);
2720
2721 if(fract_prev < fract)
2722 {
2723 ref_main_tmp1 -= 1;
2724 ref_main_tmp2 -= 1;
2725 }
2726
2727 dup_const_fract = vdup_n_u8((uint8_t)fract);
2728 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2729
2730 for(col = nt; col > 0; col -= 4)
2731 {
2732
2733 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2734 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2735
2736 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2737 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2738
2739 add_res = vaddq_u16(mul_res1, mul_res2);
2740
2741 shift_res = vrshrn_n_u16(add_res, 5);
2742
2743 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2744 pu1_dst_tmp1 += 4;
2745
2746 }
2747 pu1_dst_tmp1 += (dst_strd - nt);
2748 }
2749
2750 }
2751
2752 }
2753
2754 /**
2755 *******************************************************************************
2756 *
2757 * @brief
2758 * Intra prediction interpolation filter for luma mode 27 to mode 33
2759 *
2760 * @par Description:
2761 * Intra prediction for modes 27 to 33 (positive angle, vertical modes), using
2762 * the reference neighboring samples pointed to by 'pu1_ref' to predict the TU
2763 * block pointed to by 'pu1_dst'
2764 *
2765 * @param[in] pu1_ref
2766 * UWORD8 pointer to the reference samples
2767 *
2768 * @param[out] pu1_dst
2769 * UWORD8 pointer to the destination
2770 *
2771 * @param[in] src_strd
2772 * integer source stride
2773 *
2774 * @param[in] dst_strd
2775 * integer destination stride
2776 *
2777 * @param[in] nt
2778 * integer Transform Block size
2779 *
2780 * @param[in] mode
2781 * integer intraprediction mode
2782 *
2783 * @returns
2784 *
2785 * @remarks
2786 * None
2787 *
2788 *******************************************************************************
2789 */
2790
2791
2792 void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
2793 WORD32 src_strd,
2794 UWORD8 *pu1_dst,
2795 WORD32 dst_strd,
2796 WORD32 nt,
2797 WORD32 mode)
2798 {
2799
2800 WORD32 row, col;
2801 WORD32 intra_pred_ang;
2802 WORD32 pos, fract = 0, fract_prev;
2803
2804 WORD32 two_nt = 2 * nt;
2805 UNUSED(src_strd);
2806 if(0 == (nt & 7))
2807 {
2808
2809 UWORD8 *pu1_ref_main_idx = pu1_ref;
2810 UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2811
2812 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2813 pu1_ref_main_idx += (two_nt + 1);
2814 pu1_ref_main_idx_1 += (two_nt + 2);
2815
2816 uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2817 uint8x8_t shift_res;
2818 uint16x8_t mul_res1, mul_res2, add_res;
2819
2820 /* Intra Pred Angle according to the mode */
2821 intra_pred_ang = gai4_ihevc_ang_table[mode];
2822
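        /* Example (editorial): for mode 33 the angle is 26, so pos takes the values */
        /* 26, 52, 78, 104, 130, 156, ... giving fract = 26, 20, 14, 8, 2, 28 and */
        /* idx = 0, 1, 2, 3, 4, 4. Each drop of fract below its previous value marks */
        /* an increment of idx, which is exactly when the two reference pointers are */
        /* advanced below. */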
2823 for(row = 0; row < nt; row++)
2824 {
2825 fract_prev = fract;
2826
2827 pos = ((row + 1) * intra_pred_ang);
2828 fract = pos & (31);
2829
2830 if(fract_prev > fract)
2831 {
2832 pu1_ref_main_idx += 1;
2833 pu1_ref_main_idx_1 += 1;
2834 }
2835
2836 dup_const_fract = vdup_n_u8((uint8_t)fract);
2837 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2838
2839 for(col = nt; col > 0; col -= 8)
2840 {
2841 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2842 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2843
2844 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2845 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2846
2847 add_res = vaddq_u16(mul_res1, mul_res2);
2848
2849 shift_res = vrshrn_n_u16(add_res, 5);
2850
2851 vst1_u8(pu1_dst_tmp1, shift_res);
2852 pu1_dst_tmp1 += 8;
2853
2854 pu1_ref_main_idx += 8;
2855 pu1_ref_main_idx_1 += 8;
2856 }
2857
2858 pu1_ref_main_idx -= nt;
2859 pu1_ref_main_idx_1 -= nt;
2860
2861 pu1_dst_tmp1 += (dst_strd - nt);
2862 }
2863
2864 }
2865 else
2866 {
2867 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2868 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2869 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2870
2871        pu1_ref_tmp1 += (two_nt + 1);
2872        pu1_ref_tmp2 += (two_nt + 2);
2873
2874 uint8x8_t dup_fract, dup_32_fract, shift_res;
2875 uint16x8_t mul_res1, mul_res2, add_res;
2876 uint32x2_t pu1_ref_val1, pu1_ref_val2;
2877
2878 pu1_ref_val1 = vdup_n_u32(0);
2879 pu1_ref_val2 = vdup_n_u32(0);
2880
2881 /* Intra Pred Angle according to the mode */
2882 intra_pred_ang = gai4_ihevc_ang_table[mode];
2883
2884 for(row = 0; row < nt; row++)
2885 {
2886 fract_prev = fract;
2887 pos = ((row + 1) * intra_pred_ang);
2888 fract = pos & (31);
2889 if(fract_prev > fract)
2890 {
2891 pu1_ref_tmp1 += 1;
2892 pu1_ref_tmp2 += 1;
2893 }
2894 dup_fract = vdup_n_u8((uint8_t)fract);
2895 dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2896
2897 for(col = nt; col > 0; col -= 4)
2898 {
2899 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2900 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2901
2902 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2903 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2904
2905 add_res = vaddq_u16(mul_res1, mul_res2);
2906
2907 shift_res = vrshrn_n_u16(add_res, 5);
2908
2909 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2910 pu1_dst_tmp1 += 4;
2911
2912 }
2913
2914 pu1_dst_tmp1 += (dst_strd - nt);
2915
2916 }
2917
2918
2919 }
2920
2921 }
2922