1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_deblk.c
24 *
25 * @brief
26 *  Contains definition for the ctb level deblk function
27 *
28 * @author
29 *  ittiam
30 *
31 * @List of Functions:
32 *  ihevce_deblk_populate_qp_map()
33 *  ihevce_deblk_ctb()
34 *  ihevce_hbd_deblk_ctb()
35 *
36 * @remarks
37 *  None
38 *
39 *******************************************************************************
40 */
41 
42 /*****************************************************************************/
43 /* File Includes                                                             */
44 /*****************************************************************************/
45 /* System include files */
46 #include <stdio.h>
47 #include <string.h>
48 #include <stdlib.h>
49 #include <assert.h>
50 #include <stdarg.h>
51 #include <math.h>
52 
53 /* User include files */
54 #include "ihevc_typedefs.h"
55 #include "itt_video_api.h"
56 #include "ihevce_api.h"
57 
58 #include "rc_cntrl_param.h"
59 #include "rc_frame_info_collector.h"
60 #include "rc_look_ahead_params.h"
61 
62 #include "ihevc_defs.h"
63 #include "ihevc_debug.h"
64 #include "ihevc_structs.h"
65 #include "ihevc_platform_macros.h"
66 #include "ihevc_deblk.h"
67 #include "ihevc_deblk_tables.h"
68 #include "ihevc_common_tables.h"
69 #include "ihevc_itrans_recon.h"
70 #include "ihevc_chroma_itrans_recon.h"
71 #include "ihevc_chroma_intra_pred.h"
72 #include "ihevc_intra_pred.h"
73 #include "ihevc_inter_pred.h"
74 #include "ihevc_mem_fns.h"
75 #include "ihevc_padding.h"
76 #include "ihevc_weighted_pred.h"
77 #include "ihevc_sao.h"
78 #include "ihevc_resi_trans.h"
79 #include "ihevc_quant_iquant_ssd.h"
80 #include "ihevc_cabac_tables.h"
81 
82 #include "ihevce_defs.h"
83 #include "ihevce_hle_interface.h"
84 #include "ihevce_lap_enc_structs.h"
85 #include "ihevce_multi_thrd_structs.h"
86 #include "ihevce_me_common_defs.h"
87 #include "ihevce_had_satd.h"
88 #include "ihevce_error_codes.h"
89 #include "ihevce_bitstream.h"
90 #include "ihevce_cabac.h"
91 #include "ihevce_rdoq_macros.h"
92 #include "ihevce_function_selector.h"
93 #include "ihevce_enc_structs.h"
94 #include "ihevce_entropy_structs.h"
95 #include "ihevce_cmn_utils_instr_set_router.h"
96 #include "ihevce_enc_loop_structs.h"
97 #include "ihevce_common_utils.h"
98 #include "ihevce_global_tables.h"
99 #include "ihevce_deblk.h"
100 #include "ihevce_tile_interface.h"
101 
102 /*****************************************************************************/
103 /* Function Definitions                                                      */
104 /*****************************************************************************/
105 
106 /*!
107 ******************************************************************************
108 * \if Function name : ihevce_deblk_populate_qp_map \endif
109 *
110 * \brief
111 *
112 *
113 *****************************************************************************
114 */
ihevce_deblk_populate_qp_map(ihevce_enc_loop_ctxt_t * ps_ctxt,deblk_ctbrow_prms_t * ps_deblk_ctb_row_params,ctb_enc_loop_out_t * ps_ctb_out_dblk,WORD32 vert_ctr,frm_ctb_ctxt_t * ps_frm_ctb_prms,ihevce_tile_params_t * ps_col_tile_params)115 void ihevce_deblk_populate_qp_map(
116     ihevce_enc_loop_ctxt_t *ps_ctxt,
117     deblk_ctbrow_prms_t *ps_deblk_ctb_row_params,
118     ctb_enc_loop_out_t *ps_ctb_out_dblk,
119     WORD32 vert_ctr,
120     frm_ctb_ctxt_t *ps_frm_ctb_prms,
121     ihevce_tile_params_t *ps_col_tile_params)
122 {
123     ctb_enc_loop_out_t *ps_ctb_out;
124     WORD32 ctb_ctr, ctb_start, ctb_end;
125     WORD32 tile_qp_offset, tile_qp_size, i4_offset_for_last_cu_qp;
126     /* Create the Qp map for the entire current CTB-row for deblocking purpose(only)*/
127     /* Do this iff cur pic is referred or recon dump is enabled or psnr calc is on*/
128     /*Qp of the last CU of previous CTB row*/
129     WORD8 i1_last_cu_qp;
130     /*A pointer pointing to the top 4x4 block's Qp for all CTb rows*/
131     WORD8 *pi1_qp_top_4x4_ctb_row =
132         ps_deblk_ctb_row_params->api1_qp_top_4x4_ctb_row[ps_ctxt->i4_enc_frm_id] +
133         (ps_deblk_ctb_row_params->u4_qp_top_4x4_buf_size * ps_ctxt->i4_bitrate_instance_num);
134 
135     UWORD32 u4_qp_top_4x4_buf_strd = ps_deblk_ctb_row_params->u4_qp_top_4x4_buf_strd;
136 
137     /*The Qp map which has to be populated*/
138     UWORD32 u4_qp_buffer_stride = ps_deblk_ctb_row_params->u4_qp_buffer_stride;
139     WORD8 *pi1_ctb_tile_qp = ps_deblk_ctb_row_params->pi1_ctb_row_qp;
140 
141     /*Temporary pointers to Qp map at CTB level*/
142     WORD8 *pi1_ctb_qp_map_tile;
143 
144     i4_offset_for_last_cu_qp = ps_ctxt->pi4_offset_for_last_cu_qp[ps_ctxt->i4_tile_col_idx];
145     /* total QPs to be copied for current row is : */
146     tile_qp_size = i4_offset_for_last_cu_qp + 1;
147     /*Pointing to the first CTB of current CTB row*/
148     ps_ctb_out = ps_ctb_out_dblk;
149     /* Offset req. for the row QP to the tile start */
150     tile_qp_offset = ps_col_tile_params->i4_first_ctb_x * (ps_frm_ctb_prms->i4_ctb_size / 4);
151 
152     ctb_start = ps_col_tile_params->i4_first_ctb_x;
153     ctb_end =
154         (ps_col_tile_params->i4_first_ctb_x + ps_col_tile_params->i4_curr_tile_wd_in_ctb_unit);
155 
156     if(vert_ctr) /*Not first CTB row of frame*/
157     {
158         /*copy from top4x4_array data stored by upper CTB-row to qp-map*/
159         memcpy(
160             pi1_ctb_tile_qp,
161             (pi1_qp_top_4x4_ctb_row + (vert_ctr - 1) * u4_qp_top_4x4_buf_strd + tile_qp_offset),
162             tile_qp_size);
163     }
164 
165     /*pu1_ctb_row_qp points to top4x4 row in Qp-map.
166     Now pointing pu1_ctb_qp_map to cur 4x4 row*/
167     pi1_ctb_qp_map_tile = pi1_ctb_tile_qp + u4_qp_buffer_stride;
168 
169     /* This i1_last_cu_qp will be conditionally overwritten later */
170     i1_last_cu_qp = ps_ctxt->i4_frame_qp;
171 
172     /* -- Loop over all the CTBs in a CTB-row for populating the Qp-map ----- */
173     for(ctb_ctr = ctb_start; ctb_ctr < ctb_end; ctb_ctr++)
174     {
175         WORD32 cu_ctr;
176         cu_enc_loop_out_t *ps_curr_cu;
177 
178         /* Update i1_last_cu_qp based on CTB's position in tile */
179         update_last_coded_cu_qp(
180             (ps_deblk_ctb_row_params->pi1_ctb_row_qp + i4_offset_for_last_cu_qp),
181             ps_ctxt->i1_entropy_coding_sync_enabled_flag,
182             ps_frm_ctb_prms,
183             ps_ctxt->i4_frame_qp,
184             vert_ctr,
185             ctb_ctr,
186             &i1_last_cu_qp);
187 
188         /* store the pointer of first cu of current ctb */
189         ps_curr_cu = ps_ctb_out->ps_enc_cu;
190 
191         /* --------- loop over all the CUs in the CTB --------------- */
192         for(cu_ctr = 0; cu_ctr < ps_ctb_out->u1_num_cus_in_ctb; cu_ctr++)
193         {
194             UWORD8 u1_vert_4x4, u1_horz_4x4;  //for_loop counters
195             WORD8 *pi1_cu_qp_map;
196 
197             WORD8 i1_qp, i1_qp_left, i1_qp_top;
198 
199             pi1_cu_qp_map = pi1_ctb_qp_map_tile +
200                             (ps_curr_cu->b3_cu_pos_y * 2) * u4_qp_buffer_stride +
201                             (ps_curr_cu->b3_cu_pos_x * 2);
202 
203             /*If the current CU is coded in skip_mode/zero_CBF then
204             for deblocking, Qp of the previously coded CU will be used*/
205             if(ps_curr_cu->b1_skip_flag || ps_curr_cu->b1_no_residual_syntax_flag)
206             {
207                 if(0 == ps_curr_cu->b3_cu_pos_x)
208                     i1_qp_left = i1_last_cu_qp;
209                 else
210                     i1_qp_left = *(pi1_cu_qp_map - 1);
211 
212                 if(0 == ps_curr_cu->b3_cu_pos_y)
213                     i1_qp_top = i1_last_cu_qp;
214                 else
215                     i1_qp_top = *(pi1_cu_qp_map - u4_qp_buffer_stride);
216 
217                 i1_qp = (i1_qp_left + i1_qp_top + 1) / 2;
218 
219                 if(0 == ps_curr_cu->b1_first_cu_in_qg)
220                 {
221                     i1_qp = i1_last_cu_qp;
222                 }
223             }
224             else
225             {
226                 i1_qp = ps_curr_cu->i1_cu_qp;
227             }
228 
229             i1_last_cu_qp = i1_qp;
230 
231             /*---- Loop for populating Qp map for the current CU -------*/
232             for(u1_vert_4x4 = 0; u1_vert_4x4 < (ps_curr_cu->b4_cu_size * 2); u1_vert_4x4++)
233             {
234                 for(u1_horz_4x4 = 0; u1_horz_4x4 < (ps_curr_cu->b4_cu_size * 2); u1_horz_4x4++)
235                 {
236                     pi1_cu_qp_map[u1_horz_4x4] = i1_qp;
237                 }
238                 pi1_cu_qp_map += u4_qp_buffer_stride;
239             }
240             /*Update Qp-map ptr. Qp map is at 4x4 level but b4_cu_size is at 8x8 level*/
241             ps_curr_cu++;
242         }
243         pi1_ctb_qp_map_tile += (ps_frm_ctb_prms->i4_ctb_size / 4);  //one qp per 4x4 block.
244         ps_ctb_out++;
245 
246     }  //for(ctb_ctr = 0; ctb_ctr < num_ctbs_horz; ctb_ctr++)
247 
248     /*fill into the top4x4_array Qp for the lower CTB-row from bottom part of cur CTB row*/
249     memcpy(
250         (pi1_qp_top_4x4_ctb_row + vert_ctr * u4_qp_top_4x4_buf_strd + tile_qp_offset),
251         (pi1_ctb_tile_qp + (ps_frm_ctb_prms->i4_ctb_size / 4) * u4_qp_buffer_stride),
252         tile_qp_size);
253 }
254 
/**
*******************************************************************************
*
* @brief
*   Deblock CTB level function.
*
* @par Description:
*   For a given CTB, deblocking on both vertical and horizontal edges is
*   done. Both the luma and chroma blocks are processed, in four passes:
*   luma vertical edges, chroma vertical edges, luma horizontal edges and
*   chroma horizontal edges.
*
*   Boundary strengths (BS) are read as one UWORD32 per edge, packed as 2-bit
*   entries starting at the most significant bits; runs of zero entries are
*   skipped using CLZ().
*
*   When the left CTB edge is deblocked, horizontal-edge filtering is shifted
*   left by one 4-pixel luma column (8 interleaved chroma bytes): the last
*   column of the previous CTB is filtered here using the BS saved in
*   au1_prev_bs / au1_prev_bs_uv, and the last column of the current CTB is
*   left unfiltered unless last_col is set.
*
* @param[in,out] ps_deblk
*   Pointer to the deblock context. Source pointers, strides, QP/tc/beta
*   offsets and edge-enable flags are read; au1_prev_bs and au1_prev_bs_uv
*   are updated for the next call.
*
* @param[in] last_col
*   If the CTB is the last CTB of current CTB-row value is 1 else 0
*
* @param[in] ps_deblk_ctb_row_params
*   Deblk ctb row params: vertical/horizontal BS arrays, the 4x4 QP map and
*   its stride.
*
* @returns
*   None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevce_deblk_ctb(
    deblk_ctb_params_t *ps_deblk, WORD32 last_col, deblk_ctbrow_prms_t *ps_deblk_ctb_row_params)
{
    WORD32 ctb_size;
    UWORD32 u4_bs;
    WORD32 bs_lz; /* Leading zeros in boundary strength, in units of BS entries */
    WORD32 qp_p, qp_q;
    UWORD8 *pu1_src;
    UWORD8 *pu1_src_uv;
    UWORD8 *pu1_curr_src;
    WORD32 col_size;
    WORD32 col, row, i4_edge_count;
    WORD32 num_columns_for_vert_filt;
    WORD32 num_blks_for_vert_filt;
    WORD32 num_rows_for_horz_filt;

    /* NOTE(review): the vertical chroma filter pointer is declared with the   */
    /* horizontal filter's function-pointer type; presumably both filters      */
    /* share one signature - confirm against ihevc_deblk.h                     */
    ihevc_deblk_chroma_horz_ft *pf_deblk_chroma_horz;
    ihevc_deblk_chroma_horz_ft *pf_deblk_chroma_vert;

    /* Filter flags are packed along with the qp info.
    6 out of the 8 bits correspond to qp and 1 to filter flag. */
    /* In this function filter_p and filter_q are simply set to 1 below
    and passed unchanged to every filter call. */
    WORD32 filter_p, filter_q;
    WORD8 *pi1_ctb_row_qp_p, *pi1_ctb_row_qp_temp;
    WORD8 *pi1_ctb_row_qp_q;

    func_selector_t *ps_func_slector = ps_deblk->ps_func_selector;

    /* Edge-enable flags for the first (CTB-boundary) edge of each pass.  */
    /* Each one is forced to 1 after the first edge has been visited, so  */
    /* edges inside the CTB are always considered.                        */
    WORD32 left_luma_edge_filter_flag = ps_deblk->i4_deblock_left_ctb_edge;
    WORD32 top_luma_edge_filter_flag = ps_deblk->i4_deblock_top_ctb_edge;
    WORD32 left_chroma_edge_filter_flag = ps_deblk->i4_deblock_left_ctb_edge;
    WORD32 top_chroma_edge_filter_flag = ps_deblk->i4_deblock_top_ctb_edge;
    /* Luma and chroma walk the same BS arrays with separate cursors */
    UWORD32 *bs_vert = ps_deblk_ctb_row_params->pu4_ctb_row_bs_vert;
    UWORD32 *bs_horz = ps_deblk_ctb_row_params->pu4_ctb_row_bs_horz;
    UWORD32 *bs_vert_uv = bs_vert;
    UWORD32 *bs_horz_uv = bs_horz;
    UWORD32 u4_qp_buffer_stride = ps_deblk_ctb_row_params->u4_qp_buffer_stride;
    /* u1_chroma_array_type == 2 selects the 4:2:2 chroma filters and geometry */
    UWORD8 u1_is_422 = (ps_deblk->u1_chroma_array_type == 2);

    if(u1_is_422)
    {
        pf_deblk_chroma_horz = ps_func_slector->ihevc_deblk_422chroma_horz_fptr;
        pf_deblk_chroma_vert = ps_func_slector->ihevc_deblk_422chroma_vert_fptr;
    }
    else
    {
        pf_deblk_chroma_horz = ps_func_slector->ihevc_deblk_chroma_horz_fptr;
        pf_deblk_chroma_vert = ps_func_slector->ihevc_deblk_chroma_vert_fptr;
    }

    ctb_size = ps_deblk->i4_ctb_size;

    /* The PCM filter flag and bypass trans flag are always set to 1 in encoder profile */
    /* Can be removed during optimization */
    filter_q = 1;
    filter_p = 1;

    //////////////////////////////////////////////////////////////////////////////
    /* Luma Vertical Edge */
    pu1_src = ps_deblk->pu1_ctb_y;
    /* Skip the first line of the QP map; the current CTB row starts one stride in */
    pi1_ctb_row_qp_temp = ps_deblk_ctb_row_params->pi1_ctb_row_qp + u4_qp_buffer_stride;
    /* One vertical edge every 8 pixels, processed in segments of 4 rows */
    num_columns_for_vert_filt = ctb_size / 8;
    num_blks_for_vert_filt = ctb_size / 4;

    for(i4_edge_count = 0; i4_edge_count < num_columns_for_vert_filt; i4_edge_count++)
    {
        u4_bs = *bs_vert;
        /* get the current 4x4 vertical pointer */
        pu1_curr_src = pu1_src;
        /* QP map is per 4x4, so an 8-pixel edge step is 2 QP entries */
        pi1_ctb_row_qp_q = pi1_ctb_row_qp_temp + (i4_edge_count << 1);

        /* If the current edge is not the 1st edge of frame or slice */
        if(1 == left_luma_edge_filter_flag)
        {
            for(row = 0; row < num_blks_for_vert_filt;)
            {
                /* Number of leading 2-bit BS entries that are zero */
                bs_lz = CLZ(u4_bs) >> 1;
                /* If BS = 0, skip the edge filtering */
                if(0 != bs_lz)
                {
                    /* NOTE(review): if u4_bs is 0 and CLZ() returns 32, the  */
                    /* shift count here is 32, which C leaves undefined for a */
                    /* 32-bit operand - confirm CLZ(0) and UWORD32 width      */
                    u4_bs = u4_bs << (bs_lz << 1);
                    pu1_curr_src += ((bs_lz << 2) * ps_deblk->i4_luma_pic_stride);
                    pi1_ctb_row_qp_q += (bs_lz * u4_qp_buffer_stride);
                    row += bs_lz;
                    continue;
                }
                /* P side is the 4x4 to the left of the edge, Q side the one on it */
                qp_p = *(pi1_ctb_row_qp_q - 1);
                qp_q = *pi1_ctb_row_qp_q;

                ps_func_slector->ihevc_deblk_luma_vert_fptr(
                    pu1_curr_src,
                    ps_deblk->i4_luma_pic_stride,
                    (u4_bs >> 30), /* bits 31 and 30 are extracted */
                    qp_p,
                    qp_q,
                    ps_deblk->i4_beta_offset_div2,
                    ps_deblk->i4_tc_offset_div2,
                    filter_p,
                    filter_q);

                /* Consume one BS entry and move down by 4 rows */
                u4_bs = u4_bs << 2;
                pu1_curr_src += (ps_deblk->i4_luma_pic_stride << 2);
                pi1_ctb_row_qp_q += u4_qp_buffer_stride;
                row++;
            }
        }

        /* Increment the boundary strength and src pointer for the next column */
        bs_vert += 1;
        pu1_src += 8;

        /* Enable for the next edges of ctb*/
        left_luma_edge_filter_flag = 1;
    }

    //////////////////////////////////////////////////////////////////////////////
    /* Chroma Vertical Edge */
    pu1_src_uv = ps_deblk->pu1_ctb_uv;
    pi1_ctb_row_qp_temp = ps_deblk_ctb_row_params->pi1_ctb_row_qp + u4_qp_buffer_stride;

    /* Column spacing is 4 for each chroma component */
    /* and hence 8 when they are interleaved. */
    /* But, only those columns with a x co-ordinate */
    /* that is divisible by 8 are filtered */
    /* Hence, denominator is 16 */
    num_columns_for_vert_filt = ctb_size / 16;
    /* blk_size is 4; chroma_ctb_height is ctb_size/2 for 420 and ctb_size for 422 */
    num_blks_for_vert_filt = (0 == u1_is_422) ? (ctb_size / 2) / 4 : (ctb_size) / 4;

    for(i4_edge_count = 0; i4_edge_count < num_columns_for_vert_filt; i4_edge_count++)
    {
        /* Every alternate boundary strength value is used for 420 chroma */
        /* Both masks keep only the upper bit of a 2-bit BS entry: */
        /* 0x88888888 for every second entry, 0xaaaaaaaa for every entry */
        u4_bs = *(bs_vert_uv) & ((0 == u1_is_422) ? 0x88888888 : 0xaaaaaaaa);
        pu1_curr_src = pu1_src_uv;
        /* Filtered chroma columns are 16 luma pixels apart, i.e. 4 QP entries */
        pi1_ctb_row_qp_q = pi1_ctb_row_qp_temp + (i4_edge_count << 2);

        /* If the current edge is not the 1st edge of frame or slice */
        if(1 == left_chroma_edge_filter_flag)
        {
            /* Each 'bs' is 2 bits long */
            /* The divby4 in 420 is */
            /* necessitated by the fact that */
            /* chroma ctb_ht is half that of luma */
            /* Value is 2 for 420 (step of 4 bits) and 1 for 422 (step of 2 bits) */
            WORD32 i4_log2_num_bits_per_bs = ((0 == u1_is_422) + 1);
            /* i4_sub_heightC = 2 for 420 */
            /* i4_sub_heightC = 1 for 422 */
            WORD32 i4_sub_heightC = i4_log2_num_bits_per_bs;

            for(row = 0; row < num_blks_for_vert_filt;)
            {
                bs_lz = CLZ(u4_bs) >> i4_log2_num_bits_per_bs;

                /* If BS = 0, skip the edge filtering */
                if(0 != bs_lz)
                {
                    row += bs_lz;
                    /* NOTE(review): same shift-by-32 concern as in the luma */
                    /* pass when u4_bs is 0 - confirm CLZ(0) behaviour       */
                    u4_bs = u4_bs << (bs_lz << i4_log2_num_bits_per_bs);
                    /* '<<2' because of blk_size being 4x4 */
                    pu1_curr_src += ((bs_lz << 2) * ps_deblk->i4_chroma_pic_stride);

                    /* In 420, every alternate QP row is skipped, because chroma height */
                    /* is half of luma height. In 422, no row is skipped */
                    pi1_ctb_row_qp_q += ((u4_qp_buffer_stride << (i4_sub_heightC - 1)) * bs_lz);

                    continue;
                }

                /* P-side QP is read i4_sub_heightC entries to the left of the Q-side QP */
                qp_p = *(pi1_ctb_row_qp_q - i4_sub_heightC);
                qp_q = *pi1_ctb_row_qp_q;

                pf_deblk_chroma_vert(
                    pu1_curr_src,
                    ps_deblk->i4_chroma_pic_stride,
                    qp_p,
                    qp_q,
                    ps_deblk->i4_cb_qp_indx_offset,
                    ps_deblk->i4_cr_qp_indx_offset,
                    ps_deblk->i4_tc_offset_div2,
                    filter_p,
                    filter_q);

                /* Consume 4 bits (420) or 2 bits (422) of BS and move down 4 chroma rows */
                u4_bs = u4_bs << (1 << i4_log2_num_bits_per_bs);
                pu1_curr_src += (ps_deblk->i4_chroma_pic_stride << 2);
                pi1_ctb_row_qp_q += (u4_qp_buffer_stride << (i4_sub_heightC - 1));
                row++;
            }
        }
        /* Increment the boundary strength by 2 and src pointer for the next column */
        /* As the edge filtering happens for alternate column */
        bs_vert_uv += 2;
        pu1_src_uv += 16;
        left_chroma_edge_filter_flag = 1;
    }

    //////////////////////////////////////////////////////////////////////////////

    /* Luma Horizontal Edge */
    pu1_src = ps_deblk->pu1_ctb_y;
    /* Number of 4-pixel wide segments along one horizontal edge */
    col_size = ctb_size / 4;

    /* If the left CTB edge is deblocked, shift the src pointer left by 4     */
    /* pixels so that the last column of the previous CTB is filtered first.  */
    /* Otherwise, unless this is also the last CTB, decrement the loop count  */
    /* to exclude filtering of the last 4 pixels                              */
    if(ps_deblk->i4_deblock_left_ctb_edge == 1)
    {
        pu1_src -= 4;
        /*If the ctb is at the horizontal end of PIC*/
        /* Increase the column size to filter last 4 pixels */
        col_size += last_col;
    }
    else if(!last_col)
    {
        col_size -= 1;
    }
    {
        UWORD8 *pu1_src_temp = pu1_src;
        /* pi1_ctb_row_qp_p and pi1_ctb_row_qp_q point to alternate rows: */
        /* P is the QP line above the edge, Q the line below it           */
        pi1_ctb_row_qp_p = ps_deblk_ctb_row_params->pi1_ctb_row_qp;

        /* One horizontal edge every 8 rows */
        num_rows_for_horz_filt = ctb_size / 8;

        for(i4_edge_count = 0; i4_edge_count < num_rows_for_horz_filt; i4_edge_count++)
        {
            WORD32 col_size_temp = col_size;
            pi1_ctb_row_qp_q = pi1_ctb_row_qp_p + u4_qp_buffer_stride;
            pu1_src = pu1_src_temp + (i4_edge_count * 8 * ps_deblk->i4_luma_pic_stride);

            if(1 == top_luma_edge_filter_flag)
            {
                /* Deblock the last vertical_4x4_column of previous CTB */
                if(ps_deblk->i4_deblock_left_ctb_edge == 1)
                {
                    /* BS saved for this edge at the end of the previous call */
                    u4_bs = ps_deblk->au1_prev_bs[i4_edge_count] & 0x3;
                    if(u4_bs != 0)
                    {
                        /* QPs of the 4x4 column just left of the current CTB */
                        qp_p = *(pi1_ctb_row_qp_p - 1);
                        qp_q = *(pi1_ctb_row_qp_q - 1);

                        ps_func_slector->ihevc_deblk_luma_horz_fptr(
                            pu1_src,
                            ps_deblk->i4_luma_pic_stride,
                            u4_bs,
                            qp_p,
                            qp_q,
                            ps_deblk->i4_beta_offset_div2,
                            ps_deblk->i4_tc_offset_div2,
                            1,
                            1);
                    }

                    /* Back to the first column of the current CTB; one */
                    /* segment of the budget has been used              */
                    pu1_src += 4;
                    col_size_temp--;
                }
                /* Start deblocking current CTB */
                u4_bs = *(bs_horz);

                for(col = 0; col < col_size_temp;)
                {
                    bs_lz = CLZ(u4_bs) >> 1;
                    /* If BS = 0, skip the run of zero entries */
                    if(0 != bs_lz)
                    {
                        /* NOTE(review): shift count can reach 32 when u4_bs */
                        /* is 0 and CLZ() returns 32 - confirm               */
                        u4_bs = u4_bs << (bs_lz << 1);
                        pu1_src += 4 * bs_lz;
                        col += bs_lz;
                        continue;
                    }
                    qp_p = *(pi1_ctb_row_qp_p + col);
                    qp_q = *(pi1_ctb_row_qp_q + col);

                    ps_func_slector->ihevc_deblk_luma_horz_fptr(
                        pu1_src,
                        ps_deblk->i4_luma_pic_stride,
                        u4_bs >> (sizeof(u4_bs) * 8 - 2), /* top 2 bits = current BS */
                        qp_p,
                        qp_q,
                        ps_deblk->i4_beta_offset_div2,
                        ps_deblk->i4_tc_offset_div2,
                        filter_p,
                        filter_q);

                    pu1_src += 4;
                    u4_bs = u4_bs << 2;
                    col++;
                }
                /* Store the last vertical_4x4 column of CTB's info for next CTB deblocking */
                /* The shift brings BS entry number (ctb_size / 4 - 1) to the top 2 bits */
                u4_bs = *bs_horz;
                ps_deblk->au1_prev_bs[i4_edge_count] =
                    (UWORD8)(((u4_bs << ((ctb_size >> 1) - 2))) >> 30);
            }
            /* Next edge: one BS word further, two QP lines (8 rows) down */
            bs_horz += 1;
            pi1_ctb_row_qp_p += (u4_qp_buffer_stride << 1);
            top_luma_edge_filter_flag = 1;
        }
    }

    //////////////////////////////////////////////////////////////////////////////
    /* Chroma Horizontal Edge */
    pu1_src_uv = ps_deblk->pu1_ctb_uv;
    /* Number of segments of 8 interleaved chroma bytes along one horizontal edge */
    col_size = ctb_size / 8;

    /* If the left CTB edge is deblocked, shift the src pointer left by 8     */
    /* (uv) pixels so that the last column of the previous CTB is filtered    */
    /* first. Otherwise, unless this is also the last CTB, decrement the loop */
    /* count to exclude filtering of the last segment                         */
    if(ps_deblk->i4_deblock_left_ctb_edge == 1)
    {
        pu1_src_uv -= 8;

        /*If the ctb is at the horizontal end of PIC*/
        /* Increase the column size to filter last 8 (uv) pixels */
        col_size += last_col;
    }
    else if(!last_col)
    {
        col_size--;
    }

    {
        UWORD8 *pu1_src_temp = pu1_src_uv;

        /* pi1_ctb_row_qp_p and pi1_ctb_row_qp_q point to alternate rows */
        pi1_ctb_row_qp_p = ps_deblk_ctb_row_params->pi1_ctb_row_qp;
        /* One chroma horizontal edge per 16 luma rows in 420, per 8 luma rows in 422 */
        num_rows_for_horz_filt = ctb_size / ((0 == u1_is_422) ? 16 : 8);

        for(i4_edge_count = 0; i4_edge_count < num_rows_for_horz_filt; i4_edge_count++)
        {
            WORD32 col_size_temp = col_size;

            pi1_ctb_row_qp_q = pi1_ctb_row_qp_p + u4_qp_buffer_stride;
            /* Edges are 8 chroma rows apart */
            pu1_src_uv = pu1_src_temp + (i4_edge_count * 8 * ps_deblk->i4_chroma_pic_stride);

            if(1 == top_chroma_edge_filter_flag)
            {
                /* Deblock the last vertical _4x4_column of previous CTB */
                if(ps_deblk->i4_deblock_left_ctb_edge == 1)
                {
                    /* Only the upper bit of the saved 2-bit BS is tested */
                    u4_bs = ps_deblk->au1_prev_bs_uv[i4_edge_count] & 0x2;

                    if(u4_bs == 2)
                    {
                        qp_p = *(pi1_ctb_row_qp_p - 1);
                        qp_q = *(pi1_ctb_row_qp_q - 1);

                        pf_deblk_chroma_horz(
                            pu1_src_uv,
                            ps_deblk->i4_chroma_pic_stride,
                            qp_p,
                            qp_q,
                            ps_deblk->i4_cb_qp_indx_offset,
                            ps_deblk->i4_cr_qp_indx_offset,
                            ps_deblk->i4_tc_offset_div2,
                            1,
                            1);
                    }

                    pu1_src_uv += 8;
                    col_size_temp--;
                }

                /* Start deblocking current CTB */
                /* Keep only the upper bit of every second 2-bit BS entry */
                u4_bs = *(bs_horz_uv)&0x88888888;

                for(col = 0; col < col_size_temp;)
                {
                    /* Leading zeros counted in 4-bit steps (two BS entries) */
                    bs_lz = CLZ(u4_bs) >> 2;

                    if(0 != bs_lz)
                    {
                        /* NOTE(review): shift count can reach 32 when u4_bs */
                        /* is 0 and CLZ() returns 32 - confirm               */
                        u4_bs = u4_bs << (bs_lz << 2);
                        pu1_src_uv += (8 * bs_lz);

                        col += bs_lz;
                        continue;
                    }

                    /* Each chroma segment spans two QP entries of the 4x4 map */
                    qp_p = *(pi1_ctb_row_qp_p + (col << 1));
                    qp_q = *(pi1_ctb_row_qp_q + (col << 1));

                    pf_deblk_chroma_horz(
                        pu1_src_uv,
                        ps_deblk->i4_chroma_pic_stride,
                        qp_p,
                        qp_q,
                        ps_deblk->i4_cb_qp_indx_offset,
                        ps_deblk->i4_cr_qp_indx_offset,
                        ps_deblk->i4_tc_offset_div2,
                        filter_p,
                        filter_q);

                    pu1_src_uv += 8;
                    u4_bs = u4_bs << 4;
                    col++;
                }

                /* Store the last vertical_4x4 column of CTB's info for next CTB deblocking */
                /* The shift brings BS entry number (ctb_size / 4 - 2) to the top 2 bits */
                u4_bs = *bs_horz_uv;
                ps_deblk->au1_prev_bs_uv[i4_edge_count] =
                    (UWORD8)(((u4_bs << ((ctb_size >> 1) - 4))) >> 30);
            }

            /* Next edge: 2 BS words and 4 QP lines further in 420, 1 and 2 in 422 */
            bs_horz_uv += ((0 == u1_is_422) + 1);
            pi1_ctb_row_qp_p += (u4_qp_buffer_stride << ((0 == u1_is_422) + 1));
            top_chroma_edge_filter_flag = 1;
        }
    }

    return;
}
686