1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21  *******************************************************************************
22  * @file
23  *  ih264_inter_pred_filters.c
24  *
25  * @brief
26  *  Contains function definitions for inter prediction interpolation filters
27  *
28  * @author
29  *  Ittiam
30  *
31  * @par List of Functions:
32  *  - ih264_inter_pred_luma_copy
33  *  - ih264_interleave_copy
34  *  - ih264_inter_pred_luma_horz
35  *  - ih264_inter_pred_luma_vert
36  *  - ih264_inter_pred_luma_horz_hpel_vert_hpel
37  *  - ih264_inter_pred_luma_horz_qpel
38  *  - ih264_inter_pred_luma_vert_qpel
39  *  - ih264_inter_pred_luma_horz_qpel_vert_qpel
40  *  - ih264_inter_pred_luma_horz_hpel_vert_qpel
41  *  - ih264_inter_pred_luma_horz_qpel_vert_hpel
42  *  - ih264_inter_pred_luma_bilinear
43  *  - ih264_inter_pred_chroma
44  *
45  * @remarks
46  *  None
47  *
48  *******************************************************************************
49  */
50 
51 /*****************************************************************************/
52 /* File Includes                                                             */
53 /*****************************************************************************/
54 
55 /* User include files */
56 #include "ih264_typedefs.h"
57 #include "ih264_macros.h"
58 #include "ih264_platform_macros.h"
59 #include "ih264_inter_pred_filters.h"
60 
61 
62 /*****************************************************************************/
63 /* Constant Data variables                                                   */
64 /*****************************************************************************/
65 
66 /* coefficients for 6 tap filtering*/
67 const WORD32 ih264_g_six_tap[3] ={1,-5,20};
68 
69 
70 /*****************************************************************************/
71 /*  Function definitions .                                                   */
72 /*****************************************************************************/
73 /**
74  *******************************************************************************
75  *
76  * @brief
77  * Interprediction luma function for copy
78  *
79  * @par Description:
80  *    Copies the array of width 'wd' and height 'ht' from the  location pointed
81  *    by 'src' to the location pointed by 'dst'
82  *
83  * @param[in] pu1_src
84  *  UWORD8 pointer to the source
85  *
86  * @param[out] pu1_dst
87  *  UWORD8 pointer to the destination
88  *
89  * @param[in] src_strd
90  *  integer source stride
91  *
92  * @param[in] dst_strd
93  *  integer destination stride
94  *
95  *
96  * @param[in] ht
97  *  integer height of the array
98  *
99  * @param[in] wd
100  *  integer width of the array
101  *
102  * @returns
103  *
104  * @remarks
105  *  None
106  *
107  *******************************************************************************
108  */
109 
ih264_inter_pred_luma_copy(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)110 void ih264_inter_pred_luma_copy(UWORD8 *pu1_src,
111                                 UWORD8 *pu1_dst,
112                                 WORD32 src_strd,
113                                 WORD32 dst_strd,
114                                 WORD32 ht,
115                                 WORD32 wd,
116                                 UWORD8* pu1_tmp,
117                                 WORD32 dydx)
118 {
119     WORD32 row, col;
120     UNUSED(pu1_tmp);
121     UNUSED(dydx);
122     for(row = 0; row < ht; row++)
123     {
124         for(col = 0; col < wd; col++)
125         {
126             pu1_dst[col] = pu1_src[col];
127         }
128 
129         pu1_src += src_strd;
130         pu1_dst += dst_strd;
131     }
132 }
133 
134 /**
135  *******************************************************************************
136  *
137  * @brief
138  * Fucntion for copying to an interleaved destination
139  *
140  * @par Description:
141  *    Copies the array of width 'wd' and height 'ht' from the  location pointed
142  *    by 'src' to the location pointed by 'dst'
143  *
144  * @param[in] pu1_src
145  *  UWORD8 pointer to the source
146  *
147  * @param[out] pu1_dst
148  *  UWORD8 pointer to the destination
149  *
150  * @param[in] src_strd
151  *  integer source stride
152  *
153  * @param[in] dst_strd
154  *  integer destination stride
155  *
156  * @param[in] ht
157  *  integer height of the array
158  *
159  * @param[in] wd
160  *  integer width of the array
161  *
162  * @returns
163  *
164  * @remarks
165  *  The alternate elements of src will be copied to alternate locations in dsr
166  *  Other locations are not touched
167  *
168  *******************************************************************************
169  */
ih264_interleave_copy(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd)170 void ih264_interleave_copy(UWORD8 *pu1_src,
171                            UWORD8 *pu1_dst,
172                            WORD32 src_strd,
173                            WORD32 dst_strd,
174                            WORD32 ht,
175                            WORD32 wd)
176 {
177     WORD32 row, col;
178     wd *= 2;
179 
180     for(row = 0; row < ht; row++)
181     {
182         for(col = 0; col < wd; col+=2)
183         {
184             pu1_dst[col] = pu1_src[col];
185         }
186 
187         pu1_src += src_strd;
188         pu1_dst += dst_strd;
189     }
190 }
191 
192 /**
193  *******************************************************************************
194  *
195  * @brief
196  *     Interprediction luma filter for horizontal input
197  *
198  * @par Description:
199  *    Applies a 6 tap horizontal filter .The output is  clipped to 8 bits
200  *    sec 8.4.2.2.1 titled "Luma sample interpolation process"
201  *
202  * @param[in] pu1_src
203  *  UWORD8 pointer to the source
204  *
205  * @param[out] pu1_dst
206  *  UWORD8 pointer to the destination
207  *
208  * @param[in] src_strd
209  *  integer source stride
210  *
211  * @param[in] dst_strd
212  *  integer destination stride
213  *
214  * @param[in] ht
215  *  integer height of the array
216  *
217  * @param[in] wd
218  *  integer width of the array
219  *
220  * @returns
221  *
222  * @remarks
223  *  None
224  *
225  *******************************************************************************
226  */
ih264_inter_pred_luma_horz(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)227 void ih264_inter_pred_luma_horz(UWORD8 *pu1_src,
228                                 UWORD8 *pu1_dst,
229                                 WORD32 src_strd,
230                                 WORD32 dst_strd,
231                                 WORD32 ht,
232                                 WORD32 wd,
233                                 UWORD8* pu1_tmp,
234                                 WORD32 dydx)
235 {
236     WORD32 row, col;
237     WORD16 i2_tmp;
238     UNUSED(pu1_tmp);
239     UNUSED(dydx);
240 
241     for(row = 0; row < ht; row++)
242     {
243         for(col = 0; col < wd; col++)
244         {
245             i2_tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
246             i2_tmp = ih264_g_six_tap[0] *
247                             (pu1_src[col - 2] + pu1_src[col + 3])
248                      + ih264_g_six_tap[1] *
249                             (pu1_src[col - 1] + pu1_src[col + 2])
250                      + ih264_g_six_tap[2] *
251                             (pu1_src[col] + pu1_src[col + 1]);
252             i2_tmp = (i2_tmp + 16) >> 5;
253             pu1_dst[col] = CLIP_U8(i2_tmp);
254         }
255 
256         pu1_src += src_strd;
257         pu1_dst += dst_strd;
258     }
259 
260 }
261 
262 /**
263  *******************************************************************************
264  *
265  * @brief
266  *    Interprediction luma filter for vertical input
267  *
268  * @par Description:
269  *   Applies a 6 tap vertical filter.The output is  clipped to 8 bits
270  *    sec 8.4.2.2.1 titled "Luma sample interpolation process"
271  *
272  * @param[in] pu1_src
273  *  UWORD8 pointer to the source
274  *
275  * @param[out] pu1_dst
276  *  UWORD8 pointer to the destination
277  *
278  * @param[in] src_strd
279  *  integer source stride
280  *
281  * @param[in] dst_strd
282  *  integer destination stride
283  *
284  * @param[in] ht
285  *  integer height of the array
286  *
287  * @param[in] wd
288  *  integer width of the array
289  *
290  * @returns
291  *
292  * @remarks
293  *  None
294  *
295  *******************************************************************************
296  */
ih264_inter_pred_luma_vert(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)297 void ih264_inter_pred_luma_vert(UWORD8 *pu1_src,
298                                 UWORD8 *pu1_dst,
299                                 WORD32 src_strd,
300                                 WORD32 dst_strd,
301                                 WORD32 ht,
302                                 WORD32 wd,
303                                 UWORD8* pu1_tmp,
304                                 WORD32 dydx)
305 {
306     WORD32 row, col;
307     WORD16 i2_tmp;
308     UNUSED(pu1_tmp);
309     UNUSED(dydx);
310 
311     for(row = 0; row < ht; row++)
312     {
313         for(col = 0; col < wd; col++)
314         {
315             i2_tmp = 0; /*ih264_g_six_tap[] is the array containing the filter coeffs*/
316             i2_tmp = ih264_g_six_tap[0] *
317                             (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
318                      + ih264_g_six_tap[1] *
319                             (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
320                      + ih264_g_six_tap[2] *
321                             (pu1_src[col] + pu1_src[col + 1 * src_strd]);
322             i2_tmp = (i2_tmp + 16) >> 5;
323             pu1_dst[col] = CLIP_U8(i2_tmp);
324         }
325         pu1_src += src_strd;
326         pu1_dst += dst_strd;
327     }
328 }
329 
330 /*!
331  **************************************************************************
332  * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_hpel \endif
333  *
334  * \brief
335  *    This function implements a two stage cascaded six tap filter. It
336  *    applies the six tap filter in the horizontal direction on the
337  *    predictor values, followed by applying the same filter in the
338  *    vertical direction on the output of the first stage. The six tap
339  *    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
340  *    interpolation process"
341  *
342  * \param pu1_src: Pointer to the buffer containing the predictor values.
343  *     pu1_src could point to the frame buffer or the predictor buffer.
344  * \param pu1_dst: Pointer to the destination buffer where the output of
345  *     the six tap filter is stored.
346  * \param ht: Height of the rectangular pixel grid to be interpolated
347  * \param wd: Width of the rectangular pixel grid to be interpolated
348  * \param src_strd: Width of the buffer pointed to by pu1_src.
349  * \param dst_strd: Width of the destination buffer
350  * \param pu1_tmp: temporary buffer.
351  * \param dydx: x and y reference offset for qpel calculations: UNUSED in this function.
352  *
353  * \return
354  *    None.
355  *
356  * \note
357  *    This function takes the 8 bit predictor values, applies the six tap
358  *    filter in the horizontal direction and outputs the result clipped to
359  *    8 bit precision. The input is stored in the buffer pointed to by
360  *    pu1_src while the output is stored in the buffer pointed by pu1_dst.
361  *    Both pu1_src and pu1_dst could point to the same buffer i.e. the
362  *    six tap filter could be done in place.
363  *
364  **************************************************************************
365  */
ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)366 void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
367                                                UWORD8 *pu1_dst,
368                                                WORD32 src_strd,
369                                                WORD32 dst_strd,
370                                                WORD32 ht,
371                                                WORD32 wd,
372                                                UWORD8* pu1_tmp,
373                                                WORD32 dydx)
374 {
375     WORD32 row, col;
376     WORD32 tmp;
377     WORD16* pi2_pred1_temp;
378     WORD16* pi2_pred1;
379     UNUSED(dydx);
380     pi2_pred1_temp = (WORD16*)pu1_tmp;
381     pi2_pred1_temp += 2;
382     pi2_pred1 = pi2_pred1_temp;
383     for(row = 0; row < ht; row++)
384     {
385         for(col = -2; col < wd + 3; col++)
386         {
387             tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
388             tmp = ih264_g_six_tap[0] *
389                             (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
390                   + ih264_g_six_tap[1] *
391                             (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
392                   + ih264_g_six_tap[2] *
393                             (pu1_src[col] + pu1_src[col + 1 * src_strd]);
394             pi2_pred1_temp[col] = tmp;
395         }
396         pu1_src += src_strd;
397         pi2_pred1_temp = pi2_pred1_temp + wd + 5;
398     }
399 
400     for(row = 0; row < ht; row++)
401     {
402         for(col = 0; col < wd; col++)
403         {
404             tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
405             tmp = ih264_g_six_tap[0] *
406                             (pi2_pred1[col - 2] + pi2_pred1[col + 3])
407                   + ih264_g_six_tap[1] *
408                             (pi2_pred1[col - 1] + pi2_pred1[col + 2])
409                   + ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1]);
410             tmp = (tmp + 512) >> 10;
411             pu1_dst[col] = CLIP_U8(tmp);
412         }
413         pi2_pred1 += (wd + 5);
414         pu1_dst += dst_strd;
415     }
416 }
417 
418 /*!
419  **************************************************************************
420  * \if Function name : ih264_inter_pred_luma_horz_qpel \endif
421  *
422  * \brief
423  *    This routine applies the six tap filter to the predictors in the
424  *    horizontal direction. The six tap filtering operation is described in
425  *    sec 8.4.2.2.1 titled "Luma sample interpolation process"
426  *
427  * \param pu1_src: Pointer to the buffer containing the predictor values.
428  *     pu1_src could point to the frame buffer or the predictor buffer.
429  * \param pu1_dst: Pointer to the destination buffer where the output of
430  *     the six tap filter is stored.
431  * \param ht: Height of the rectangular pixel grid to be interpolated
432  * \param wd: Width of the rectangular pixel grid to be interpolated
433  * \param src_strd: Width of the buffer pointed to by pu1_src.
434  * \param dst_strd: Width of the destination buffer
435  * \param pu1_tmp: temporary buffer: UNUSED in this function
436  * \param dydx: x and y reference offset for qpel calculations.
437  *
438  * \return
439  *    None.
440  *
441  * \note
442  *    This function takes the 8 bit predictor values, applies the six tap
443  *    filter in the horizontal direction and outputs the result clipped to
444  *    8 bit precision. The input is stored in the buffer pointed to by
445  *    pu1_src while the output is stored in the buffer pointed by pu1_dst.
446  *    Both pu1_src and pu1_dst could point to the same buffer i.e. the
447  *    six tap filter could be done in place.
448  *
449  **************************************************************************
450  */
ih264_inter_pred_luma_horz_qpel(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)451 void ih264_inter_pred_luma_horz_qpel(UWORD8 *pu1_src,
452                                      UWORD8 *pu1_dst,
453                                      WORD32 src_strd,
454                                      WORD32 dst_strd,
455                                      WORD32 ht,
456                                      WORD32 wd,
457                                      UWORD8* pu1_tmp,
458                                      WORD32 dydx)
459 {
460     WORD32 row, col;
461     UWORD8 *pu1_pred1;
462     WORD32 x_offset = dydx & 0x3;
463     UNUSED(pu1_tmp);
464     pu1_pred1 = pu1_src + (x_offset >> 1);
465 
466     for(row = 0; row < ht; row++)
467     {
468         for(col = 0; col < wd; col++, pu1_src++, pu1_dst++)
469         {
470             WORD16 i2_temp;
471             /* The logic below implements the following equation
472              i2_temp = puc_pred[-2] - 5 * (puc_pred[-1] + puc_pred[2]) +
473              20 * (puc_pred[0] + puc_pred[1]) + puc_pred[3]; */
474             i2_temp = pu1_src[-2] + pu1_src[3]
475                       - (pu1_src[-1] + pu1_src[2])
476                       + ((pu1_src[0] + pu1_src[1] - pu1_src[-1] - pu1_src[2]) << 2)
477                       + ((pu1_src[0] + pu1_src[1]) << 4);
478             i2_temp = (i2_temp + 16) >> 5;
479             i2_temp = CLIP_U8(i2_temp);
480             *pu1_dst = (i2_temp + *pu1_pred1 + 1) >> 1;
481 
482             pu1_pred1++;
483         }
484         pu1_dst += dst_strd - wd;
485         pu1_src += src_strd - wd;
486         pu1_pred1 += src_strd - wd;
487     }
488 }
489 
490 /*!
491  **************************************************************************
492  * \if Function name : ih264_inter_pred_luma_vert_qpel \endif
493  *
494  * \brief
495  *    This routine applies the six tap filter to the predictors in the
496  *    vertical direction and interpolates them to obtain pixels at quarter vertical
497  *    positions (0, 1/4) and (0, 3/4). The six tap filtering operation is
498  *    described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
499  *
500  * \param pu1_src: Pointer to the buffer containing the predictor values.
501  *     pu1_src could point to the frame buffer or the predictor buffer.
502  * \param pu1_dst: Pointer to the destination buffer where the output of
503  *     the six tap filter is stored.
504  * \param ht: Height of the rectangular pixel grid to be interpolated
505  * \param wd: Width of the rectangular pixel grid to be interpolated
506  * \param src_strd: Width of the buffer pointed to by puc_pred.
507  * \param dst_strd: Width of the destination buffer
508  * \param pu1_tmp: temporary buffer: UNUSED in this function
509  * \param dydx: x and y reference offset for qpel calculations.
510  *
511  * \return
512  *    void
513  *
514  * \note
515  *    This function takes the 8 bit predictor values, applies the six tap
516  *    filter in the vertical direction and outputs the result clipped to
517  *    8 bit precision. The input is stored in the buffer pointed to by
518  *    puc_pred while the output is stored in the buffer pointed by puc_dest.
519  *    Both puc_pred and puc_dest could point to the same buffer i.e. the
520  *    six tap filter could be done in place.
521  *
522  * \para <title>
523  *    <paragraph>
524  *  ...
525  **************************************************************************
526  */
ih264_inter_pred_luma_vert_qpel(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)527 void ih264_inter_pred_luma_vert_qpel(UWORD8 *pu1_src,
528                                      UWORD8 *pu1_dst,
529                                      WORD32 src_strd,
530                                      WORD32 dst_strd,
531                                      WORD32 ht,
532                                      WORD32 wd,
533                                      UWORD8* pu1_tmp,
534                                      WORD32 dydx)
535 {
536     WORD32 row, col;
537     WORD32 y_offset = dydx >> 2;
538     WORD32 off1, off2, off3;
539     UWORD8 *pu1_pred1;
540     UNUSED(pu1_tmp);
541     y_offset = y_offset & 0x3;
542 
543     off1 = src_strd;
544     off2 = src_strd << 1;
545     off3 = off1 + off2;
546 
547     pu1_pred1 = pu1_src + (y_offset >> 1) * src_strd;
548 
549     for(row = 0; row < ht; row++)
550     {
551         for(col = 0; col < wd; col++, pu1_dst++, pu1_src++, pu1_pred1++)
552         {
553             WORD16 i2_temp;
554             /* The logic below implements the following equation
555              i16_temp = puc_pred[-2*src_strd] + puc_pred[3*src_strd] -
556              5 * (puc_pred[-1*src_strd] + puc_pred[2*src_strd])  +
557              20 * (puc_pred[0] + puc_pred[src_strd]); */
558             i2_temp = pu1_src[-off2] + pu1_src[off3]
559                        - (pu1_src[-off1] + pu1_src[off2])
560                        + ((pu1_src[0] + pu1_src[off1] - pu1_src[-off1] - pu1_src[off2]) << 2)
561                        + ((pu1_src[0] + pu1_src[off1]) << 4);
562             i2_temp = (i2_temp + 16) >> 5;
563             i2_temp = CLIP_U8(i2_temp);
564 
565             *pu1_dst = (i2_temp + *pu1_pred1 + 1) >> 1;
566         }
567         pu1_src += src_strd - wd;
568         pu1_pred1 += src_strd - wd;
569         pu1_dst += dst_strd - wd;
570     }
571 }
572 
573 /*!
574  **************************************************************************
575  * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_qpel \endif
576  *
577  * \brief
578  *    This routine applies the six tap filter to the predictors in the
579  *    vertical and horizontal direction and averages them to get pixels at locations
580  *    (1/4,1/4), (1/4, 3/4), (3/4, 1/4) & (3/4, 3/4). The six tap filtering operation
581  *    is described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
582  *
583  * \param pu1_src: Pointer to the buffer containing the predictor values.
584  *     pu1_src could point to the frame buffer or the predictor buffer.
585  * \param pu1_dst: Pointer to the destination buffer where the output of
586  *     the six tap filter is stored.
587  * \param wd: Width of the rectangular pixel grid to be interpolated
588  * \param ht: Height of the rectangular pixel grid to be interpolated
589  * \param src_strd: Width of the buffer pointed to by puc_pred.
590  * \param dst_strd: Width of the destination buffer
591  * \param pu1_tmp: temporary buffer, UNUSED in this function
592  * \param dydx: x and y reference offset for qpel calculations.
593  *
594  * \return
595  *    void
596  *
597  * \note
598  *    This function takes the 8 bit predictor values, applies the six tap
599  *    filter in the vertical direction and outputs the result clipped to
600  *    8 bit precision. The input is stored in the buffer pointed to by
601  *    puc_pred while the output is stored in the buffer pointed by puc_dest.
602  *    Both puc_pred and puc_dest could point to the same buffer i.e. the
603  *    six tap filter could be done in place.
604  *
605  * \para <title>
606  *    <paragraph>
607  *  ...
608  **************************************************************************
609  */
ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)610 void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
611                                                UWORD8 *pu1_dst,
612                                                WORD32 src_strd,
613                                                WORD32 dst_strd,
614                                                WORD32 ht,
615                                                WORD32 wd,
616                                                UWORD8* pu1_tmp,
617                                                WORD32 dydx)
618 {
619     WORD32 row, col;
620     WORD32 x_offset = dydx & 0x3;
621     WORD32 y_offset = dydx >> 2;
622 
623     WORD32 off1, off2, off3;
624     UWORD8* pu1_pred_vert, *pu1_pred_horz;
625     UNUSED(pu1_tmp);
626     y_offset = y_offset & 0x3;
627 
628     off1 = src_strd;
629     off2 = src_strd << 1;
630     off3 = off1 + off2;
631 
632     pu1_pred_horz = pu1_src + (y_offset >> 1) * src_strd;
633     pu1_pred_vert = pu1_src + (x_offset >> 1);
634 
635     for(row = 0; row < ht; row++)
636     {
637         for(col = 0; col < wd;
638                         col++, pu1_dst++, pu1_pred_vert++, pu1_pred_horz++)
639         {
640             WORD16 i2_temp_vert, i2_temp_horz;
641             /* The logic below implements the following equation
642              i2_temp = puc_pred[-2*src_strd] + puc_pred[3*src_strd] -
643              5 * (puc_pred[-1*src_strd] + puc_pred[2*src_strd])  +
644              20 * (puc_pred[0] + puc_pred[src_strd]); */
645             i2_temp_vert = pu1_pred_vert[-off2] + pu1_pred_vert[off3]
646                             - (pu1_pred_vert[-off1] + pu1_pred_vert[off2])
647                             + ((pu1_pred_vert[0] + pu1_pred_vert[off1]
648                                             - pu1_pred_vert[-off1]
649                                             - pu1_pred_vert[off2]) << 2)
650                             + ((pu1_pred_vert[0] + pu1_pred_vert[off1]) << 4);
651             i2_temp_vert = (i2_temp_vert + 16) >> 5;
652             i2_temp_vert = CLIP_U8(i2_temp_vert);
653 
654             /* The logic below implements the following equation
655              i16_temp = puc_pred[-2] - 5 * (puc_pred[-1] + puc_pred[2]) +
656              20 * (puc_pred[0] + puc_pred[1]) + puc_pred[3]; */
657             i2_temp_horz = pu1_pred_horz[-2] + pu1_pred_horz[3]
658                             - (pu1_pred_horz[-1] + pu1_pred_horz[2])
659                             + ((pu1_pred_horz[0] + pu1_pred_horz[1]
660                                             - pu1_pred_horz[-1]
661                                             - pu1_pred_horz[2]) << 2)
662                             + ((pu1_pred_horz[0] + pu1_pred_horz[1]) << 4);
663             i2_temp_horz = (i2_temp_horz + 16) >> 5;
664             i2_temp_horz = CLIP_U8(i2_temp_horz);
665             *pu1_dst = (i2_temp_vert + i2_temp_horz + 1) >> 1;
666         }
667         pu1_pred_vert += (src_strd - wd);
668         pu1_pred_horz += (src_strd - wd);
669         pu1_dst += (dst_strd - wd);
670     }
671 }
672 
673 /*!
674  **************************************************************************
675  * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_hpel \endif
676  *
677  * \brief
678  *    This routine applies the six tap filter to the predictors in the vertical
679  *    and horizontal direction to obtain the pixel at (1/2,1/2). It then interpolates
680  *    pixel at (0,1/2) and (1/2,1/2) to obtain pixel at (1/4,1/2). Similarly for (3/4,1/2).
681  *    The six tap filtering operation is described in sec 8.4.2.2.1 titled
682  *    "Luma sample interpolation process"
683  *
684  * \param pu1_src: Pointer to the buffer containing the predictor values.
685  *     pu1_src could point to the frame buffer or the predictor buffer.
686  * \param pu1_dst: Pointer to the destination buffer where the output of
687  *     the six tap filter followed by interpolation is stored.
688  * \param wd: Width of the rectangular pixel grid to be interpolated
689  * \param ht: Height of the rectangular pixel grid to be interpolated
690  * \param src_strd: Width of the buffer pointed to by puc_pred.
691  * \param dst_strd: Width of the destination buffer
692  * \param pu1_tmp: buffer to store temporary output after 1st 6-tap filter.
693  * \param dydx: x and y reference offset for qpel calculations.
694  *
695  * \return
696  *    void
697  *
698  * \note
699  *    This function takes the 8 bit predictor values, applies the six tap
700  *    filter in the vertical direction and outputs the result clipped to
701  *    8 bit precision. The input is stored in the buffer pointed to by
702  *    puc_pred while the output is stored in the buffer pointed by puc_dest.
703  *    Both puc_pred and puc_dest could point to the same buffer i.e. the
704  *    six tap filter could be done in place.
705  *
706  * \para <title>
707  *    <paragraph>
708  *  ...
709  **************************************************************************
710  */
ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)711 void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
712                                                UWORD8 *pu1_dst,
713                                                WORD32 src_strd,
714                                                WORD32 dst_strd,
715                                                WORD32 ht,
716                                                WORD32 wd,
717                                                UWORD8* pu1_tmp,
718                                                WORD32 dydx)
719 {
720     WORD32 row, col;
721     WORD32 tmp;
722     WORD16* pi2_pred1_temp, *pi2_pred1;
723     UWORD8* pu1_dst_tmp;
724     WORD32 x_offset = dydx & 0x3;
725     WORD16 i2_macro;
726 
727     pi2_pred1_temp = (WORD16*)pu1_tmp;
728     pi2_pred1_temp += 2;
729     pi2_pred1 = pi2_pred1_temp;
730     pu1_dst_tmp = pu1_dst;
731 
732     for(row = 0; row < ht; row++)
733     {
734         for(col = -2; col < wd + 3; col++)
735         {
736             tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
737             tmp = ih264_g_six_tap[0] *
738                             (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
739                   + ih264_g_six_tap[1] *
740                             (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
741                   + ih264_g_six_tap[2] *
742                             (pu1_src[col] + pu1_src[col + 1 * src_strd]);
743             pi2_pred1_temp[col] = tmp;
744         }
745 
746         pu1_src += src_strd;
747         pi2_pred1_temp = pi2_pred1_temp + wd + 5;
748     }
749 
750     pi2_pred1_temp = pi2_pred1;
751     for(row = 0; row < ht; row++)
752     {
753         for(col = 0; col < wd; col++)
754         {
755             tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
756             tmp = ih264_g_six_tap[0] *
757                             (pi2_pred1[col - 2] + pi2_pred1[col + 3])
758                   + ih264_g_six_tap[1] *
759                             (pi2_pred1[col - 1] + pi2_pred1[col + 2])
760                   + ih264_g_six_tap[2] *
761                             (pi2_pred1[col] + pi2_pred1[col + 1]);
762             tmp = (tmp + 512) >> 10;
763             pu1_dst[col] = CLIP_U8(tmp);
764         }
765         pi2_pred1 += (wd + 5);
766         pu1_dst += dst_strd;
767     }
768 
769     pu1_dst = pu1_dst_tmp;
770     pi2_pred1_temp += (x_offset >> 1);
771     for(row = ht; row != 0; row--)
772     {
773         for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
774         {
775             UWORD8 uc_temp;
776             /* Clipping the output of the six tap filter obtained from the
777              first stage of the 2d filter stage */
778             *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
779             i2_macro = (*pi2_pred1_temp);
780             uc_temp = CLIP_U8(i2_macro);
781             *pu1_dst = (*pu1_dst + uc_temp + 1) >> 1;
782         }
783         pi2_pred1_temp += 5;
784         pu1_dst += dst_strd - wd;
785     }
786 }
787 
788 /*!
789  **************************************************************************
790  * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_qpel \endif
791  *
792  * \brief
793  *    This routine applies the six tap filter to the predictors in the horizontal
794  *    and vertical direction to obtain the pixel at (1/2,1/2). It then interpolates
795  *    pixel at (1/2,0) and (1/2,1/2) to obtain pixel at (1/2,1/4). Similarly for (1/2,3/4).
796  *    The six tap filtering operation is described in sec 8.4.2.2.1 titled
797  *    "Luma sample interpolation process"
798  *
799  * \param pu1_src: Pointer to the buffer containing the predictor values.
800  *     pu1_src could point to the frame buffer or the predictor buffer.
801  * \param pu1_dst: Pointer to the destination buffer where the output of
802  *     the six tap filter followed by interpolation is stored.
803  * \param wd: Width of the rectangular pixel grid to be interpolated
804  * \param ht: Height of the rectangular pixel grid to be interpolated
805  * \param src_strd: Width of the buffer pointed to by puc_pred.
806  * \param dst_strd: Width of the destination buffer
807  * \param pu1_tmp: buffer to store temporary output after 1st 6-tap filter.
808  * \param dydx: x and y reference offset for qpel calculations.
809  *
810  * \return
811  *    void
812  *
813  * \note
814  *    This function takes the 8 bit predictor values, applies the six tap
815  *    filter in the vertical direction and outputs the result clipped to
816  *    8 bit precision. The input is stored in the buffer pointed to by
817  *    puc_pred while the output is stored in the buffer pointed by puc_dest.
818  *    Both puc_pred and puc_dest could point to the same buffer i.e. the
819  *    six tap filter could be done in place.
820  *
821  * \para <title>
822  *    <paragraph>
823  *  ...
824  **************************************************************************
825  */
ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ht,WORD32 wd,UWORD8 * pu1_tmp,WORD32 dydx)826 void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
827                                                UWORD8 *pu1_dst,
828                                                WORD32 src_strd,
829                                                WORD32 dst_strd,
830                                                WORD32 ht,
831                                                WORD32 wd,
832                                                UWORD8* pu1_tmp,
833                                                WORD32 dydx)
834 {
835 
836     WORD32 row, col;
837     WORD32 tmp;
838     WORD32 y_offset = dydx >> 2;
839     WORD16* pi2_pred1_temp, *pi2_pred1;
840     UWORD8* pu1_dst_tmp;
841     //WORD32 x_offset = dydx & 0x3;
842     WORD16 i2_macro;
843 
844     y_offset = y_offset & 0x3;
845 
846     pi2_pred1_temp = (WORD16*)pu1_tmp;
847     pi2_pred1_temp += 2 * wd;
848     pi2_pred1 = pi2_pred1_temp;
849     pu1_dst_tmp = pu1_dst;
850     pu1_src -= 2 * src_strd;
851     for(row = -2; row < ht + 3; row++)
852     {
853         for(col = 0; col < wd; col++)
854         {
855             tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
856             tmp = ih264_g_six_tap[0] * (pu1_src[col - 2] + pu1_src[col + 3])
857                   + ih264_g_six_tap[1] * (pu1_src[col - 1] + pu1_src[col + 2])
858                   + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1]);
859             pi2_pred1_temp[col - 2 * wd] = tmp;
860         }
861 
862         pu1_src += src_strd;
863         pi2_pred1_temp += wd;
864     }
865     pi2_pred1_temp = pi2_pred1;
866     for(row = 0; row < ht; row++)
867     {
868         for(col = 0; col < wd; col++)
869         {
870             tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
871             tmp = ih264_g_six_tap[0] * (pi2_pred1[col - 2 * wd] + pi2_pred1[col + 3 * wd])
872                   + ih264_g_six_tap[1] * (pi2_pred1[col - 1 * wd] + pi2_pred1[col + 2 * wd])
873                   + ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1 * wd]);
874             tmp = (tmp + 512) >> 10;
875             pu1_dst[col] = CLIP_U8(tmp);
876         }
877         pi2_pred1 += wd;
878         pu1_dst += dst_strd;
879     }
880     pu1_dst = pu1_dst_tmp;
881     pi2_pred1_temp += (y_offset >> 1) * wd;
882     for(row = ht; row != 0; row--)
883 
884     {
885         for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
886         {
887             UWORD8 u1_temp;
888             /* Clipping the output of the six tap filter obtained from the
889              first stage of the 2d filter stage */
890             *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
891             i2_macro = (*pi2_pred1_temp);
892             u1_temp = CLIP_U8(i2_macro);
893             *pu1_dst = (*pu1_dst + u1_temp + 1) >> 1;
894         }
895         //pi16_pred1_temp += wd;
896         pu1_dst += dst_strd - wd;
897     }
898 }
899 
900 /**
901  *******************************************************************************
902  *  function:ih264_inter_pred_luma_bilinear
903  *
904  * @brief
905  *    This routine applies the bilinear filter to the predictors .
906  *    The  filtering operation is described in
907  *    sec 8.4.2.2.1 titled "Luma sample interpolation process"
908  *
909  * @par Description:
910 \note
911  *     This function is called to obtain pixels lying at the following
912  *    locations (1/4,1), (3/4,1),(1,1/4), (1,3/4) ,(1/4,1/2), (3/4,1/2),(1/2,1/4), (1/2,3/4),(3/4,1/4),(1/4,3/4),(3/4,3/4)&& (1/4,1/4) .
913  *    The function averages the two adjacent values from the two input arrays in horizontal direction.
914  *
915  *
916  * @param[in] pu1_src1:
917  *  UWORD8 Pointer to the buffer containing the first input array.
918  *
919  * @param[in] pu1_src2:
920  *  UWORD8 Pointer to the buffer containing the second input array.
921  *
922  * @param[out] pu1_dst
923  *  UWORD8 pointer to the destination where the output of bilinear filter is stored.
924  *
925  * @param[in] src_strd1
926  *  Stride of the first input buffer
927  *
928  * @param[in] src_strd2
929  *  Stride of the second input buffer
930  *
931  * @param[in] dst_strd
932  *  integer destination stride of pu1_dst
933  *
934  * @param[in] ht
935  *  integer height of the array
936  *
937  * @param[in] wd
938  *  integer width of the array
939  *
940  * @returns
941  *
942  * @remarks
943  *  None
944  *
945  *******************************************************************************
946  */
ih264_inter_pred_luma_bilinear(UWORD8 * pu1_src1,UWORD8 * pu1_src2,UWORD8 * pu1_dst,WORD32 src_strd1,WORD32 src_strd2,WORD32 dst_strd,WORD32 ht,WORD32 wd)947 void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
948                                     UWORD8 *pu1_src2,
949                                     UWORD8 *pu1_dst,
950                                     WORD32 src_strd1,
951                                     WORD32 src_strd2,
952                                     WORD32 dst_strd,
953                                     WORD32 ht,
954                                     WORD32 wd)
955 {
956     WORD32 row, col;
957     WORD16 i2_tmp;
958 
959     for(row = 0; row < ht; row++)
960     {
961         for(col = 0; col < wd; col++)
962         {
963             i2_tmp = pu1_src1[col] + pu1_src2[col];
964             i2_tmp = (i2_tmp + 1) >> 1;
965             pu1_dst[col] = CLIP_U8(i2_tmp);
966         }
967         pu1_src1 += src_strd1;
968         pu1_src2 += src_strd2;
969         pu1_dst += dst_strd;
970     }
971 
972 }
973 
974 /**
975  *******************************************************************************
976  *
977  * @brief
978  *    Interprediction chroma filter
979  *
980  * @par Description:
981  *   Applies filtering to chroma samples as mentioned in
982  *    sec 8.4.2.2.2 titled "chroma sample interpolation process"
983  *
984  * @param[in] pu1_src
985  *  UWORD8 pointer to the source containing alternate U and V samples
986  *
987  * @param[out] pu1_dst
988  *  UWORD8 pointer to the destination
989  *
990  * @param[in] src_strd
991  *  integer source stride
992  *
993  * @param[in] dst_strd
994  *  integer destination stride
995  *
996  * @param[in] u1_dx
997  *  dx value where the sample is to be produced(refer sec 8.4.2.2.2 )
998  *
999  * @param[in] u1_dy
1000  *  dy value where the sample is to be produced(refer sec 8.4.2.2.2 )
1001  *
1002  * @param[in] ht
1003  *  integer height of the array
1004  *
1005  * @param[in] wd
1006  *  integer width of the array
1007  *
1008  * @returns
1009  *
1010  * @remarks
1011  *  None
1012  *
1013  *******************************************************************************
1014  */
ih264_inter_pred_chroma(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 dx,WORD32 dy,WORD32 ht,WORD32 wd)1015 void ih264_inter_pred_chroma(UWORD8 *pu1_src,
1016                              UWORD8 *pu1_dst,
1017                              WORD32 src_strd,
1018                              WORD32 dst_strd,
1019                              WORD32 dx,
1020                              WORD32 dy,
1021                              WORD32 ht,
1022                              WORD32 wd)
1023 {
1024     WORD32 row, col;
1025     WORD16 i2_tmp;
1026 
1027     for(row = 0; row < ht; row++)
1028     {
1029         for(col = 0; col < 2 * wd; col++)
1030         {
1031             i2_tmp = 0; /* applies equation (8-266) in section 8.4.2.2.2 */
1032             i2_tmp = (8 - dx) * (8 - dy) * pu1_src[col]
1033                      + (dx) * (8 - dy) * pu1_src[col + 2]
1034                      + (8 - dx) * (dy) * (pu1_src + src_strd)[col]
1035                      + (dx) * (dy) * (pu1_src + src_strd)[col + 2];
1036             i2_tmp = (i2_tmp + 32) >> 6;
1037             pu1_dst[col] = CLIP_U8(i2_tmp);
1038         }
1039         pu1_src += src_strd;
1040         pu1_dst += dst_strd;
1041     }
1042 }
1043