1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 ******************************************************************************
23 * @file hme_subpel.c
24 *
25 * @brief
26 *    Subpel refinement modules for ME algo
27 *
28 * @author
29 *    Ittiam
30 *
31 *
32 * List of Functions
33 * hme_qpel_interp_avg()
34 * hme_subpel_refine_ctblist_bck()
35 * hme_subpel_refine_ctblist_fwd()
36 * hme_refine_bidirect()
37 * hme_subpel_refinement()
38 * hme_subpel_refine_ctb_fwd()
39 * hme_subpel_refine_ctb_bck()
40 * hme_create_bck_inp()
41 * hme_subpel_refine_search_node()
42 ******************************************************************************
43 */
44 
45 /*****************************************************************************/
46 /* File Includes                                                             */
47 /*****************************************************************************/
48 /* System include files */
49 #include <stdio.h>
50 #include <string.h>
51 #include <stdlib.h>
52 #include <assert.h>
53 #include <stdarg.h>
54 #include <math.h>
55 #include <limits.h>
56 
57 /* User include files */
58 #include "ihevc_typedefs.h"
59 #include "itt_video_api.h"
60 #include "ihevce_api.h"
61 
62 #include "rc_cntrl_param.h"
63 #include "rc_frame_info_collector.h"
64 #include "rc_look_ahead_params.h"
65 
66 #include "ihevc_defs.h"
67 #include "ihevc_structs.h"
68 #include "ihevc_platform_macros.h"
69 #include "ihevc_deblk.h"
70 #include "ihevc_itrans_recon.h"
71 #include "ihevc_chroma_itrans_recon.h"
72 #include "ihevc_chroma_intra_pred.h"
73 #include "ihevc_intra_pred.h"
74 #include "ihevc_inter_pred.h"
75 #include "ihevc_mem_fns.h"
76 #include "ihevc_padding.h"
77 #include "ihevc_weighted_pred.h"
78 #include "ihevc_sao.h"
79 #include "ihevc_resi_trans.h"
80 #include "ihevc_quant_iquant_ssd.h"
81 #include "ihevc_cabac_tables.h"
82 
83 #include "ihevce_defs.h"
84 #include "ihevce_lap_enc_structs.h"
85 #include "ihevce_multi_thrd_structs.h"
86 #include "ihevce_multi_thrd_funcs.h"
87 #include "ihevce_me_common_defs.h"
88 #include "ihevce_had_satd.h"
89 #include "ihevce_error_codes.h"
90 #include "ihevce_bitstream.h"
91 #include "ihevce_cabac.h"
92 #include "ihevce_rdoq_macros.h"
93 #include "ihevce_function_selector.h"
94 #include "ihevce_enc_structs.h"
95 #include "ihevce_entropy_structs.h"
96 #include "ihevce_cmn_utils_instr_set_router.h"
97 #include "ihevce_enc_loop_structs.h"
98 #include "ihevce_bs_compute_ctb.h"
99 #include "ihevce_global_tables.h"
100 #include "ihevce_dep_mngr_interface.h"
101 #include "hme_datatype.h"
102 #include "hme_interface.h"
103 #include "hme_common_defs.h"
104 #include "hme_defs.h"
105 #include "ihevce_me_instr_set_router.h"
106 #include "hme_globals.h"
107 #include "hme_utils.h"
108 #include "hme_coarse.h"
109 #include "hme_fullpel.h"
110 #include "hme_subpel.h"
111 #include "hme_refine.h"
112 #include "hme_err_compute.h"
113 #include "hme_common_utils.h"
114 #include "hme_search_algo.h"
115 #include "ihevce_stasino_helpers.h"
116 #include "ihevce_common_utils.h"
117 
118 /*****************************************************************************/
119 /* Function Definitions                                                      */
120 /*****************************************************************************/
/**
********************************************************************************
*  @fn     void hme_qpel_interp_avg(interp_prms_t *ps_prms,
*                                   S32 i4_mv_x,
*                                   S32 i4_mv_y,
*                                   S32 i4_buf_id)
*
*  @brief  Produces the prediction block for one qpel motion vector, either by
*          pointing straight into a reference plane or by averaging two planes
*
*  @param[in,out]  ps_prms : reference plane ptrs, strides and blk size on
*                  input; pu1_final_out / i4_final_out_stride on output
*
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
*
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
*
*  @param[in]  i4_buf_id : index into ps_prms->apu1_interp_out[] of the buffer
*              that receives the averaged block (untouched if no averaging)
*
*  @return None
********************************************************************************
*/
void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    qpel_input_buf_cfg_t *ps_cfg;
    U08 *pu1_plane_a, *pu1_plane_b, *pu1_avg_out;
    S32 i4_stride, i4_fpel_disp;

    /*************************************************************************/
    /* For a given QPEL pt, we need to determine the 2 source pts that are   */
    /* needed to do the QPEL averaging. i4_mv_x and i4_mv_y are the motion   */
    /* vectors in QPEL units pointing to the pt of interest, w.r.t. the 0,0  */
    /* pt of the reference blk that is colocated to the inp blk.             */
    /*    A j E k B                                                          */
    /*    l m n o p                                                          */
    /*    F q G r H                                                          */
    /*    s t u v w                                                          */
    /*    C x I y D                                                          */
    /* In above diagram, A, B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
    /* and (1,1) respectively in the fpel buffer (id = 0)                    */
    /* E and I are hxfy pts in offsets (0,0),(0,1) respectively in hxfy buf  */
    /* F and H are fxhy pts in offsets (0,0),(1,0) respectively in fxhy buf  */
    /* G is hxhy pt in offset 0,0 in hxhy buf                                */
    /* All above offsets are computed w.r.t. motion displaced pt in          */
    /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and  */
    /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G                     */
    /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3   */
    /* Consider pt v, which has a fractional comp of (3, 3). v is the avg of */
    /* H and I, so the table look up of v should give the following:         */
    /* src 1 (H) : offset = (1, 0), buf id = 2.                              */
    /* src 2 (I) : offset = (0, 1), buf id = 1.                              */
    /* NOTE: For pts that are fxfy/hxfy/fxhy/hxhy no averaging is needed;    */
    /* the code below recognises them by both buf ids of the entry matching. */
    /*************************************************************************/

    i4_stride = ps_prms->i4_ref_stride;

    /* Table entry is selected by the fractional (low 2 bits) part of the mv */
    ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

    /* Displacement of the integer part of the mv inside any of the planes */
    i4_fpel_disp = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;

    /* First source is needed on both paths */
    pu1_plane_a = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1] +
                  (ps_cfg->i1_buf_xoff1 + i4_fpel_disp + (ps_cfg->i1_buf_yoff1 * i4_stride));

    if(ps_cfg->i1_buf_id1 != ps_cfg->i1_buf_id2)
    {
        /* True qpel pt: average the two sources into the caller's buffer */
        pu1_plane_b = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2] +
                      (ps_cfg->i1_buf_xoff2 + i4_fpel_disp + (ps_cfg->i1_buf_yoff2 * i4_stride));

        pu1_avg_out = ps_prms->apu1_interp_out[i4_buf_id];

        hevc_avg_2d(
            pu1_plane_a,
            pu1_plane_b,
            i4_stride,
            i4_stride,
            ps_prms->i4_blk_wd,
            ps_prms->i4_blk_ht,
            pu1_avg_out,
            ps_prms->i4_out_stride);

        ps_prms->pu1_final_out = pu1_avg_out;
        ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
    }
    else
    {
        /* fxfy/hxfy/fxhy/hxhy pt: result already exists in a reference plane */
        ps_prms->pu1_final_out = pu1_plane_a;
        ps_prms->i4_final_out_stride = i4_stride;
    }
}
193 
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg_2pt_vert_no_reuse
*
*  @brief  Evaluates the two vertical qpel neighbours of a motion vector with
*          two independent calls to the supplied 1-pt routine: first
*          (i4_mv_y + 1) with id 3, then (i4_mv_y - 1) with id 1
*
*  @param[in,out]  ps_prms : forwarded unchanged to the 1-pt routine
*
*  @param[in]  i4_mv_x : x component of the centre motion vector (QPEL units)
*
*  @param[in]  i4_mv_y : y component of the centre motion vector (QPEL units)
*
*  @param[out]  ppu1_final : forwarded unchanged to the 1-pt routine
*
*  @param[out]  pi4_final_stride : forwarded unchanged to the 1-pt routine
*
*  @param[in]  pf_qpel_interp_avg_1pt : routine that handles a single point
*
*  @return None
********************************************************************************
*/
static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
{
    /* Ids handed to the 1-pt routine for the two vertical neighbours */
    const S32 i4_id_mvy_plus_1 = 3;
    const S32 i4_id_mvy_minus_1 = 1;

    const S32 i4_mv_y_plus_1 = i4_mv_y + 1;
    const S32 i4_mv_y_minus_1 = i4_mv_y - 1;

    /* Call order is kept: +1 neighbour first, -1 neighbour second */
    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x, i4_mv_y_plus_1, i4_id_mvy_plus_1, ppu1_final, pi4_final_stride);

    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x, i4_mv_y_minus_1, i4_id_mvy_minus_1, ppu1_final, pi4_final_stride);
}
206 
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg_2pt_horz_no_reuse
*
*  @brief  Evaluates the two horizontal qpel neighbours of a motion vector with
*          two independent calls to the supplied 1-pt routine: first
*          (i4_mv_x + 1) with id 2, then (i4_mv_x - 1) with id 0
*
*  @param[in,out]  ps_prms : forwarded unchanged to the 1-pt routine
*
*  @param[in]  i4_mv_x : x component of the centre motion vector (QPEL units)
*
*  @param[in]  i4_mv_y : y component of the centre motion vector (QPEL units)
*
*  @param[out]  ppu1_final : forwarded unchanged to the 1-pt routine
*
*  @param[out]  pi4_final_stride : forwarded unchanged to the 1-pt routine
*
*  @param[in]  pf_qpel_interp_avg_1pt : routine that handles a single point
*
*  @return None
********************************************************************************
*/
static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
{
    /* Ids handed to the 1-pt routine for the two horizontal neighbours */
    const S32 i4_id_mvx_plus_1 = 2;
    const S32 i4_id_mvx_minus_1 = 0;

    const S32 i4_mv_x_plus_1 = i4_mv_x + 1;
    const S32 i4_mv_x_minus_1 = i4_mv_x - 1;

    /* Call order is kept: +1 neighbour first, -1 neighbour second */
    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x_plus_1, i4_mv_y, i4_id_mvx_plus_1, ppu1_final, pi4_final_stride);

    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x_minus_1, i4_mv_y, i4_id_mvx_minus_1, ppu1_final, pi4_final_stride);
}
219 
220 /********************************************************************************
221 *  @fn     hme_qpel_interp_comprehensive
222 *
223 *  @brief  Interpolates 2 qpel points by hpel averaging
224 *
225 *  @param[in,out]  ps_prms: Both input buffer ptrs and location of output
226 *
227 *  @param[in]  i4_mv_x : x component of motion vector in QPEL units
228 *
229 *  @param[in]  i4_mv_y : y component of motion vector in QPEL units
230 *
231 *  @param[in]  i4_grid_mask : mask which determines qpels to be computed
232 *
233 *  @param[out]  ppu1_final : storage for final buffer pointers
234 *
235 *  @param[out]  pi4_final_stride : storage for final buffer strides
236 *
237 *  @return None
238 ********************************************************************************
239 */
hme_qpel_interp_comprehensive(interp_prms_t * ps_prms,U08 ** ppu1_final,S32 * pi4_final_stride,S32 i4_mv_x,S32 i4_mv_y,S32 i4_grid_mask,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)240 static __inline void hme_qpel_interp_comprehensive(
241     interp_prms_t *ps_prms,
242     U08 **ppu1_final,
243     S32 *pi4_final_stride,
244     S32 i4_mv_x,
245     S32 i4_mv_y,
246     S32 i4_grid_mask,
247     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
248 {
249     S32 pt_select_for_TB, pt_select_for_LR;
250     S32 dx, dy, dydx;
251     S32 vert_func_selector, horz_func_selector;
252 
253     S32 i4_ref_stride = ps_prms->i4_ref_stride;
254 
255     pt_select_for_TB =
256         ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));
257 
258     pt_select_for_LR =
259         ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));
260 
261     dx = (i4_mv_x & 3);
262     dy = (i4_mv_y & 3);
263     dydx = (dx + (dy << 2));
264 
265     vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
266     horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];
267 
268     /* case descriptions */
269     /* Let T = (gridmask & T) & B = (gridmask & B) */
270     /* & hp = pt is an hpel or an fpel */
271     /* & r = reuse possible */
272     /* 0 => T || B = 0 */
273     /* 1 => (!T) && (B) && hp */
274     /* 2 => (T) && (!B) && hp */
275     /* 3 => (!T) && (B) && !hp */
276     /* 4 => (T) && (!B) && !hp */
277     /* 5 => (T) && (B) && !hp && r */
278     /* 6 => (T) && (B) && !hp && !r */
279     /* 7 => (T) && (B) && hp */
280 
281     switch(vert_func_selector)
282     {
283     case 0:
284     {
285         break;
286     }
287     case 1:
288     {
289         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
290         qpel_input_buf_cfg_t *ps_inp_cfg;
291         S32 i4_mvyp1 = (i4_mv_y + 1);
292 
293         i4_mv_x_frac = dx;
294         i4_mv_y_frac = i4_mvyp1 & 3;
295 
296         i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
297 
298         /* Derive the descriptor that has all offset and size info */
299         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
300 
301         ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
302         ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
303         ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
304         pi4_final_stride[3] = i4_ref_stride;
305 
306         break;
307     }
308     case 2:
309     {
310         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
311         qpel_input_buf_cfg_t *ps_inp_cfg;
312         S32 i4_mvym1 = (i4_mv_y - 1);
313 
314         i4_mv_x_frac = dx;
315         i4_mv_y_frac = i4_mvym1 & 3;
316 
317         i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
318 
319         /* Derive the descriptor that has all offset and size info */
320         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
321 
322         ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
323         ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
324         ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
325         pi4_final_stride[1] = i4_ref_stride;
326 
327         break;
328     }
329     case 3:
330     {
331         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
332             ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
333 
334         break;
335     }
336     case 4:
337     {
338         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
339             ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
340 
341         break;
342     }
343     case 5:
344     {
345         ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
346             ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
347         break;
348     }
349     case 6:
350     {
351         hme_qpel_interp_avg_2pt_vert_no_reuse(
352             ps_prms,
353             i4_mv_x,
354             i4_mv_y,
355             ppu1_final,
356             pi4_final_stride,
357             ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
358         break;
359     }
360     case 7:
361     {
362         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
363         qpel_input_buf_cfg_t *ps_inp_cfg;
364 
365         S32 i4_mvyp1 = (i4_mv_y + 1);
366         S32 i4_mvym1 = (i4_mv_y - 1);
367 
368         i4_mv_x_frac = dx;
369         i4_mv_y_frac = i4_mvyp1 & 3;
370 
371         i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
372 
373         /* Derive the descriptor that has all offset and size info */
374         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
375 
376         ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
377         ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
378         ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
379         pi4_final_stride[3] = i4_ref_stride;
380 
381         i4_mv_y_frac = i4_mvym1 & 3;
382 
383         i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
384 
385         /* Derive the descriptor that has all offset and size info */
386         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
387 
388         ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
389         ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
390         ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
391         pi4_final_stride[1] = i4_ref_stride;
392 
393         break;
394     }
395     }
396 
397     /* case descriptions */
398     /* Let L = (gridmask & L) & R = (gridmask & R) */
399     /* & hp = pt is an hpel or an fpel */
400     /* & r = reuse possible */
401     /* 0 => L || R = 0 */
402     /* 1 => (!L) && (R) && hp */
403     /* 2 => (L) && (!R) && hp */
404     /* 3 => (!L) && (R) && !hp */
405     /* 4 => (L) && (!R) && !hp */
406     /* 5 => (L) && (R) && !hp && r */
407     /* 6 => (L) && (R) && !hp && !r */
408     /* 7 => (L) && (R) && hp */
409 
410     switch(horz_func_selector)
411     {
412     case 0:
413     {
414         break;
415     }
416     case 1:
417     {
418         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
419         qpel_input_buf_cfg_t *ps_inp_cfg;
420         S32 i4_mvxp1 = (i4_mv_x + 1);
421 
422         i4_mv_x_frac = i4_mvxp1 & 3;
423         i4_mv_y_frac = dy;
424 
425         i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
426 
427         /* Derive the descriptor that has all offset and size info */
428         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
429 
430         ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
431         ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
432         ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
433         pi4_final_stride[2] = i4_ref_stride;
434 
435         break;
436     }
437     case 2:
438     {
439         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
440         qpel_input_buf_cfg_t *ps_inp_cfg;
441         S32 i4_mvxm1 = (i4_mv_x - 1);
442 
443         i4_mv_x_frac = i4_mvxm1 & 3;
444         i4_mv_y_frac = dy;
445 
446         i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
447 
448         /* Derive the descriptor that has all offset and size info */
449         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
450 
451         ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
452         ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
453         ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
454         pi4_final_stride[0] = i4_ref_stride;
455 
456         break;
457     }
458     case 3:
459     {
460         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
461             ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
462 
463         break;
464     }
465     case 4:
466     {
467         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
468             ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
469 
470         break;
471     }
472     case 5:
473     {
474         ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
475             ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
476         break;
477     }
478     case 6:
479     {
480         hme_qpel_interp_avg_2pt_horz_no_reuse(
481             ps_prms,
482             i4_mv_x,
483             i4_mv_y,
484             ppu1_final,
485             pi4_final_stride,
486             ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
487         break;
488     }
489     case 7:
490     {
491         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
492         qpel_input_buf_cfg_t *ps_inp_cfg;
493 
494         S32 i4_mvxp1 = (i4_mv_x + 1);
495         S32 i4_mvxm1 = (i4_mv_x - 1);
496 
497         i4_mv_x_frac = i4_mvxp1 & 3;
498         i4_mv_y_frac = dy;
499 
500         i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
501 
502         /* Derive the descriptor that has all offset and size info */
503         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
504 
505         ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
506         ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
507         ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
508         pi4_final_stride[2] = i4_ref_stride;
509 
510         i4_mv_x_frac = i4_mvxm1 & 3;
511 
512         i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
513 
514         /* Derive the descriptor that has all offset and size info */
515         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
516 
517         ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
518         ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
519         ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
520         pi4_final_stride[0] = i4_ref_stride;
521 
522         break;
523     }
524     }
525 }
526 
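/**
********************************************************************************
*  @fn     void hme_compute_pred_and_evaluate_bi(
*              inter_cu_results_t *ps_cu_results,
*              inter_pu_results_t *ps_pu_results,
*              inter_ctb_prms_t *ps_inter_ctb_prms,
*              part_type_results_t *ps_part_type_result,
*              ULWORD64 *pu8_winning_pred_sigmaXSquare,
*              ULWORD64 *pu8_winning_pred_sigmaX,
*              ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
*              ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
*
*
*  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
*          best L0 and L1 bufs respectively for the entire CU
*
*  @param[in]  ps_cu_results: position (w.r.t. CTB), size and input offset of
*              the CU being evaluated
*
*  @param[in]  ps_pu_results: per-partition uni-directional results for L0
*              and L1
*
*  @param[in]  ps_inter_ctb_prms: CTB level inputs (strides, CTB offsets,
*              reference lists, number of active references, noise flag)
*
*  @param[in]  ps_part_type_result: partition type under evaluation and its
*              per-partition PU results
*
*  @param  pu8_winning_pred_sigmaXSquare, pu8_winning_pred_sigmaX:
*          NOTE(review): presumably outputs carrying the sigmaX^2 / sigmaX of
*          the winning prediction - confirm against the function body
*
*  @param[in]  ps_cmn_utils_optimised_function_list,
*              ps_me_optimised_function_list: function pointer tables used
*              for the interpolation/averaging and related kernels
*
*  @return None (the function is declared void).
*          NOTE(review): earlier documentation stated "the best BI cost or
*          best uni cost, whichever better" - confirm how the winning cost
*          is reported to the caller
********************************************************************************
*/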
hme_compute_pred_and_evaluate_bi(inter_cu_results_t * ps_cu_results,inter_pu_results_t * ps_pu_results,inter_ctb_prms_t * ps_inter_ctb_prms,part_type_results_t * ps_part_type_result,ULWORD64 * pu8_winning_pred_sigmaXSquare,ULWORD64 * pu8_winning_pred_sigmaX,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)545 void hme_compute_pred_and_evaluate_bi(
546     inter_cu_results_t *ps_cu_results,
547     inter_pu_results_t *ps_pu_results,
548     inter_ctb_prms_t *ps_inter_ctb_prms,
549     part_type_results_t *ps_part_type_result,
550     ULWORD64 *pu8_winning_pred_sigmaXSquare,
551     ULWORD64 *pu8_winning_pred_sigmaX,
552     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
553     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
554 {
555     /* Idx0 - Uni winner */
556     /* Idx1 - Uni runner-up */
557     /* Idx2 - Bi winner */
558     hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
559     err_prms_t s_err_prms;
560     interp_prms_t s_interp_prms;
561 
562     PF_SAD_FXN_T pf_err_compute;
563 
564     S32 i, j;
565     S32 x_off, y_off, x_pic, y_pic;
566     S32 i4_sad_grid;
567     U08 e_cu_size;
568     S32 i4_part_type;
569     U08 u1_cu_size;
570     S32 shift;
571     S32 x_part, y_part, num_parts;
572     S32 inp_stride, ref_stride;
573     U08 au1_pred_buf_array_indixes[3];
574     S32 cur_iter_best_cost;
575     S32 uni_cost, bi_cost, best_cost, tot_cost;
576     /* Idx0 - Uni winner */
577     /* Idx1 - Bi winner */
578     ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
579     ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];
580 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
581     S32 i4_noise_term;
582 #endif
583 
584     interp_prms_t *ps_interp_prms = &s_interp_prms;
585 
586     S32 best_cand_in_opp_dir_idx = 0;
587     S32 is_best_cand_an_intra = 0;
588     U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;
589 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
590     const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
591 #endif
592     tot_cost = 0;
593 
594     /* Start of the CU w.r.t. CTB */
595     x_off = ps_cu_results->u1_x_off;
596     y_off = ps_cu_results->u1_y_off;
597 
598     inp_stride = ps_inter_ctb_prms->i4_inp_stride;
599     ref_stride = ps_inter_ctb_prms->i4_rec_stride;
600 
601     ps_interp_prms->i4_ref_stride = ref_stride;
602 
603     /* Start of the CU w.r.t. Pic 0,0 */
604     x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off;
605     y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off;
606 
607     u1_cu_size = ps_cu_results->u1_cu_size;
608     e_cu_size = u1_cu_size;
609     shift = (S32)e_cu_size;
610     i4_part_type = ps_part_type_result->u1_part_type;
611     num_parts = gau1_num_parts_in_part_type[i4_part_type];
612 
613     for(i = 0; i < 3; i++)
614     {
615         hme_init_pred_buf_info(
616             &as_pred_buf_data[i],
617             &ps_inter_ctb_prms->s_pred_buf_mngr,
618             (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2,
619             (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2,
620             (PART_TYPE_T)i4_part_type);
621 
622         au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id;
623     }
624 
625     for(j = 0; j < num_parts; j++)
626     {
627         UWORD8 *apu1_hpel_ref[2][4];
628         PART_ID_T e_part_id;
629         BLK_SIZE_T e_blk_size;
630         WORD8 i1_ref_idx;
631         UWORD8 pred_dir;
632         WORD32 ref_offset, inp_offset, wd, ht;
633         pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result;
634         mv_t *aps_mv[2];
635         UWORD8 num_active_ref_opp;
636         UWORD8 num_results_per_part;
637         WORD32 luma_weight_ref1, luma_offset_ref1;
638         WORD32 luma_weight_ref2, luma_offset_ref2;
639         WORD32 pu_node2_found = 0;
640 
641         e_part_id = ge_part_type_to_part_id[i4_part_type][j];
642         e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id];
643 
644         x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift;
645         y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift;
646 
647         ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride;
648         inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset;
649 
650         pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode;
651 
652         ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]);
653 
654         if(PRED_L0 == pred_dir)
655         {
656             i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx;
657             aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv);
658 
659             num_active_ref_opp =
660                 ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled);
661             num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id];
662 
663             ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id];
664 
665             ASSERT(i1_ref_idx >= 0);
666 
667             apu1_hpel_ref[0][0] =
668                 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
669                 ref_offset;
670             apu1_hpel_ref[0][1] =
671                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
672                 ref_offset;
673             apu1_hpel_ref[0][2] =
674                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
675                 ref_offset;
676             apu1_hpel_ref[0][3] =
677                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
678                 ref_offset;
679 
680             luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
681                                    ->s_weight_offset.i2_luma_weight;
682             luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
683                                    ->s_weight_offset.i2_luma_offset;
684         }
685         else
686         {
687             i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx;
688             aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv);
689 
690             ASSERT(i1_ref_idx >= 0);
691 
692             num_active_ref_opp =
693                 ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled);
694             num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id];
695 
696             ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id];
697 
698             apu1_hpel_ref[0][0] =
699                 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
700                 ref_offset;
701             apu1_hpel_ref[0][1] =
702                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
703                 ref_offset;
704             apu1_hpel_ref[0][2] =
705                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
706                 ref_offset;
707             apu1_hpel_ref[0][3] =
708                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
709                 ref_offset;
710 
711             luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
712                                    ->s_weight_offset.i2_luma_weight;
713             luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
714                                    ->s_weight_offset.i2_luma_offset;
715         }
716 
717         if(aps_mv[0]->i2_mvx == INTRA_MV)
718         {
719             uni_cost = ps_pu_node1->i4_tot_cost;
720             cur_iter_best_cost = ps_pu_node1->i4_tot_cost;
721             best_cost = MIN(uni_cost, cur_iter_best_cost);
722             tot_cost += best_cost;
723             continue;
724         }
725 
726         ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size];
727         ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size];
728         ps_interp_prms->i4_out_stride = MAX_CU_SIZE;
729 
730         if(num_active_ref_opp)
731         {
732             if(PRED_L0 == pred_dir)
733             {
734                 if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
735                 {
736                     ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id];
737                     pu_node2_found = 1;
738                 }
739             }
740             else
741             {
742                 if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
743                 {
744                     ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id];
745                     pu_node2_found = 1;
746                 }
747             }
748         }
749 
750         if(!pu_node2_found)
751         {
752             bi_cost = INT_MAX >> 1;
753 
754             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
755             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
756 
757             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
758                 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
759 
760             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
761             {
762                 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
763                 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
764                 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
765             }
766 
767             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
768             {
769                 hme_compute_sigmaX_and_sigmaXSquared(
770                     as_pred_buf_data[0][j].pu1_pred,
771                     as_pred_buf_data[0][j].i4_pred_stride,
772                     &au8_sigmaX[0][j],
773                     &au8_sigmaXSquared[0][j],
774                     ps_interp_prms->i4_blk_wd,
775                     ps_interp_prms->i4_blk_ht,
776                     ps_interp_prms->i4_blk_wd,
777                     ps_interp_prms->i4_blk_ht,
778                     0,
779                     1);
780             }
781         }
782         else
783         {
784             i = 0;
785             bi_cost = MAX_32BIT_VAL;
786             is_best_cand_an_intra = 0;
787             best_cand_in_opp_dir_idx = 0;
788 
789             pred_dir = ps_pu_node2[i].pu.b2_pred_mode;
790 
791             if(PRED_L0 == pred_dir)
792             {
793                 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx;
794                 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv);
795 
796                 ASSERT(i1_ref_idx >= 0);
797 
798                 apu1_hpel_ref[1][0] =
799                     (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
800                                    ->s_yuv_buf_desc.pv_y_buf) +
801                     ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
802                 apu1_hpel_ref[1][1] =
803                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
804                     ref_offset;
805                 apu1_hpel_ref[1][2] =
806                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
807                     ref_offset;
808                 apu1_hpel_ref[1][3] =
809                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
810                     ref_offset;
811 
812                 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
813                                        ->s_weight_offset.i2_luma_weight;
814                 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
815                                        ->s_weight_offset.i2_luma_offset;
816             }
817             else
818             {
819                 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx;
820                 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv);
821 
822                 ASSERT(i1_ref_idx >= 0);
823 
824                 apu1_hpel_ref[1][0] =
825                     (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
826                                    ->s_yuv_buf_desc.pv_y_buf) +
827                     ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
828                 apu1_hpel_ref[1][1] =
829                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
830                     ref_offset;
831                 apu1_hpel_ref[1][2] =
832                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
833                     ref_offset;
834                 apu1_hpel_ref[1][3] =
835                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
836                     ref_offset;
837 
838                 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
839                                        ->s_weight_offset.i2_luma_weight;
840                 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
841                                        ->s_weight_offset.i2_luma_offset;
842             }
843 
844             if(aps_mv[1]->i2_mvx == INTRA_MV)
845             {
846                 uni_cost = ps_pu_node1->i4_tot_cost;
847                 cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost;
848 
849                 if(cur_iter_best_cost < bi_cost)
850                 {
851                     bi_cost = cur_iter_best_cost;
852                     best_cand_in_opp_dir_idx = i;
853                     is_best_cand_an_intra = 1;
854                 }
855 
856                 best_cost = MIN(uni_cost, bi_cost);
857                 tot_cost += best_cost;
858                 continue;
859             }
860 
861             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
862             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
863 
864             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
865                 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
866 
867             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
868             {
869                 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
870                 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
871                 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
872             }
873 
874             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
875             {
876                 hme_compute_sigmaX_and_sigmaXSquared(
877                     as_pred_buf_data[0][j].pu1_pred,
878                     as_pred_buf_data[0][j].i4_pred_stride,
879                     &au8_sigmaX[0][j],
880                     &au8_sigmaXSquared[0][j],
881                     ps_interp_prms->i4_blk_wd,
882                     ps_interp_prms->i4_blk_ht,
883                     ps_interp_prms->i4_blk_wd,
884                     ps_interp_prms->i4_blk_ht,
885                     0,
886                     1);
887             }
888 
889             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred;
890             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0];
891 
892             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
893                 ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0);
894 
895             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
896             {
897                 as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX;
898                 as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out;
899                 as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
900             }
901 
902             ps_cmn_utils_optimised_function_list->pf_wt_avg_2d(
903                 as_pred_buf_data[0][j].pu1_pred,
904                 as_pred_buf_data[1][j].pu1_pred,
905                 as_pred_buf_data[0][j].i4_pred_stride,
906                 as_pred_buf_data[1][j].i4_pred_stride,
907                 wd,
908                 ht,
909                 as_pred_buf_data[2][j].pu1_pred,
910                 as_pred_buf_data[2][j].i4_pred_stride,
911                 luma_weight_ref1,
912                 luma_weight_ref2,
913                 luma_offset_ref1,
914                 luma_offset_ref2,
915                 ps_inter_ctb_prms->wpred_log_wdc);
916 
917             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
918             {
919                 hme_compute_sigmaX_and_sigmaXSquared(
920                     as_pred_buf_data[2][j].pu1_pred,
921                     as_pred_buf_data[2][j].i4_pred_stride,
922                     &au8_sigmaX[1][j],
923                     &au8_sigmaXSquared[1][j],
924                     ps_interp_prms->i4_blk_wd,
925                     ps_interp_prms->i4_blk_ht,
926                     ps_interp_prms->i4_blk_wd,
927                     ps_interp_prms->i4_blk_ht,
928                     0,
929                     1);
930             }
931 
932             s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset;
933             s_err_prms.i4_inp_stride = inp_stride;
934             s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride;
935             s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
936             s_err_prms.i4_grid_mask = 1;
937             s_err_prms.pi4_sad_grid = &i4_sad_grid;
938             s_err_prms.i4_blk_wd = wd;
939             s_err_prms.i4_blk_ht = ht;
940             s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred;
941             s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list;
942 
943             if(ps_inter_ctb_prms->u1_use_satd)
944             {
945                 pf_err_compute = compute_satd_8bit;
946             }
947             else
948             {
949                 pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit;
950             }
951 
952             pf_err_compute(&s_err_prms);
953 
954 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
955             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
956             {
957                 unsigned long u4_shift_val;
958                 ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
959                 ULWORD64 u8_temp_var, u8_temp_var1;
960                 S32 i4_bits_req;
961 
962                 S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
963 
964                 u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]);
965                 u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX;
966 
967                 if(e_cu_size == CU_8x8)
968                 {
969                     PART_ID_T e_part_id =
970                         (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
971 
972                     u4_shift_val = ihevce_calc_stim_injected_variance(
973                         ps_inter_ctb_prms->pu8_part_src_sigmaX,
974                         ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
975                         &u8_src_variance,
976                         i4_default_src_wt,
977                         0,
978                         ps_inter_ctb_prms->wpred_log_wdc,
979                         e_part_id);
980                 }
981                 else
982                 {
983                     u4_shift_val = ihevce_calc_stim_injected_variance(
984                         ps_inter_ctb_prms->pu8_part_src_sigmaX,
985                         ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
986                         &u8_src_variance,
987                         i4_default_src_wt,
988                         0,
989                         ps_inter_ctb_prms->wpred_log_wdc,
990                         e_part_id);
991                 }
992 
993                 u8_pred_variance = u8_pred_variance >> u4_shift_val;
994 
995                 GETRANGE64(i4_bits_req, u8_pred_variance);
996 
997                 if(i4_bits_req > 27)
998                 {
999                     u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1000                     u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1001                 }
1002 
1003                 if(u8_src_variance == u8_pred_variance)
1004                 {
1005                     u8_temp_var = (1 << STIM_Q_FORMAT);
1006                 }
1007                 else
1008                 {
1009                     u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1010                     u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1011                     u8_temp_var1 =
1012                         (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1013                     u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1014                     u8_temp_var = (u8_temp_var / u8_temp_var1);
1015                 }
1016 
1017                 i4_noise_term = (UWORD32)u8_temp_var;
1018 
1019                 i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1020 
1021                 ASSERT(i4_noise_term >= 0);
1022 
1023                 u8_temp_var = i4_sad_grid;
1024                 u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1025                 u8_temp_var += (1 << ((i4_q_level)-1));
1026                 i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level));
1027             }
1028 #endif
1029 
1030             cur_iter_best_cost = i4_sad_grid;
1031             cur_iter_best_cost += ps_pu_node1->i4_mv_cost;
1032             cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost;
1033 
1034             if(cur_iter_best_cost < bi_cost)
1035             {
1036                 bi_cost = cur_iter_best_cost;
1037                 best_cand_in_opp_dir_idx = i;
1038                 is_best_cand_an_intra = 0;
1039             }
1040         }
1041 
1042         uni_cost = ps_pu_node1->i4_tot_cost;
1043 
1044 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
1045         if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1046         {
1047             unsigned long u4_shift_val;
1048             ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
1049             ULWORD64 u8_temp_var, u8_temp_var1;
1050             S32 i4_bits_req;
1051 
1052             S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
1053 
1054             S08 i1_ref_idx =
1055                 (PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1056                     ? ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx]
1057                     : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx];
1058             S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost;
1059 
1060             u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]);
1061             u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX;
1062 
1063             if(e_cu_size == CU_8x8)
1064             {
1065                 PART_ID_T e_part_id =
1066                     (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
1067 
1068                 u4_shift_val = ihevce_calc_stim_injected_variance(
1069                     ps_inter_ctb_prms->pu8_part_src_sigmaX,
1070                     ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1071                     &u8_src_variance,
1072                     ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1073                     ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1074                     ps_inter_ctb_prms->wpred_log_wdc,
1075                     e_part_id);
1076             }
1077             else
1078             {
1079                 u4_shift_val = ihevce_calc_stim_injected_variance(
1080                     ps_inter_ctb_prms->pu8_part_src_sigmaX,
1081                     ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1082                     &u8_src_variance,
1083                     ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1084                     ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1085                     ps_inter_ctb_prms->wpred_log_wdc,
1086                     e_part_id);
1087             }
1088 
1089             u8_pred_variance = u8_pred_variance >> (u4_shift_val);
1090 
1091             GETRANGE64(i4_bits_req, u8_pred_variance);
1092 
1093             if(i4_bits_req > 27)
1094             {
1095                 u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1096                 u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1097             }
1098 
1099             if(u8_src_variance == u8_pred_variance)
1100             {
1101                 u8_temp_var = (1 << STIM_Q_FORMAT);
1102             }
1103             else
1104             {
1105                 u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1106                 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1107                 u8_temp_var1 =
1108                     (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1109                 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1110                 u8_temp_var = (u8_temp_var / u8_temp_var1);
1111             }
1112 
1113             i4_noise_term = (UWORD32)u8_temp_var;
1114 
1115             i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1116 
1117             ASSERT(i4_noise_term >= 0);
1118 
1119             u8_temp_var = i4_sad;
1120             u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1121             u8_temp_var += (1 << ((i4_q_level)-1));
1122             i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level));
1123 
1124             uni_cost = i4_sad + ps_pu_node1->i4_mv_cost;
1125 
1126             pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j];
1127             pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j];
1128         }
1129 #endif
1130 
1131         if((bi_cost < uni_cost) && (!is_best_cand_an_intra))
1132         {
1133             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1134             {
1135                 pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j];
1136                 pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j];
1137             }
1138 
1139             if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1140             {
1141                 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1142 
1143                 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1144                 {
1145                     ps_pu_node1->pu.mv.i1_l1_ref_idx =
1146                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1147                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1148                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1149                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1150                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1151                 }
1152                 else
1153                 {
1154                     ps_pu_node1->pu.mv.i1_l1_ref_idx =
1155                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1156                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1157                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1158                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1159                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1160                 }
1161             }
1162             else
1163             {
1164                 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1165 
1166                 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1167                 {
1168                     ps_pu_node1->pu.mv.i1_l0_ref_idx =
1169                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1170                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1171                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1172                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1173                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1174                 }
1175                 else
1176                 {
1177                     ps_pu_node1->pu.mv.i1_l0_ref_idx =
1178                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1179                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1180                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1181                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1182                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1183                 }
1184             }
1185 
1186             ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
1187         }
1188 
1189         best_cost = MIN(uni_cost, bi_cost);
1190         tot_cost += best_cost;
1191     }
1192 
1193     hme_debrief_bipred_eval(
1194         ps_part_type_result,
1195         as_pred_buf_data,
1196         &ps_inter_ctb_prms->s_pred_buf_mngr,
1197         au1_pred_buf_array_indixes,
1198         ps_cmn_utils_optimised_function_list);
1199 
1200     ps_part_type_result->i4_tot_cost = tot_cost;
1201 }
1202 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_8x8_tu_rec
*
*  @brief  Invokes the 8x8 Hadamard kernel chosen through ps_func_selector on
*          the input / reference pair described by ps_prms and unpacks the
*          combined value that kernel returns
*
*  @param[inout]  ps_prms: supplies input and ref pointers, strides, working
*                 memory and the transform depth / size limits. Its
*                 i4_tu_split_cost is reset to 0 and handed to the kernel;
*                 entry PART_ID_2Nx2N of pi4_sad_grid, pi4_tu_split_flags and
*                 pi4_tu_early_cbf is written
*
*  @param[in]  lambda: not referenced by this function
*
*  @param[in]  lambda_q_shift: not referenced by this function
*
*  @param[in]  i4_frm_qstep: forwarded unchanged to the kernel
*
*  @param[in]  ps_func_selector: provides pf_had_8x8_using_4_4x4_r
*
*  @return     The kernel's return value shifted right by two bits
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    /* Scratch outputs the kernel reaches through the per-level tables below */
    S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */
    S32 i4_satd_8x8;
    S32 i4_tu_split_flag = 0;
    S32 i4_tu_early_cbf = 0;

    /* Per transform-size pointer tables; only the 4x4 / 8x8 rows are populated */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *api4_tu_split[HAD_32x32 + 1];
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];

    /* Caller owned result arrays, captured before the kernel runs */
    S32 *pi4_satd_out = ps_prms->pi4_sad_grid;
    S32 *pi4_split_out = ps_prms->pi4_tu_split_flags;
    S32 *pi4_cbf_out = ps_prms->pi4_tu_early_cbf;

    S16 *pi2_had_coeffs;
    S32 i4_packed_result;

    /* The split cost is accumulated by the kernel, so start it from zero */
    ps_prms->i4_tu_split_cost = 0;
    pi2_had_coeffs = (S16 *)ps_prms->pu1_wkg_mem;

    api4_satd_pu[HAD_32x32] = NULL;
    api4_satd_pu[HAD_16x16] = NULL;
    api4_satd_pu[HAD_8x8] = &i4_satd_8x8;
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];

    api4_tu_split[HAD_32x32] = NULL;
    api4_tu_split[HAD_16x16] = NULL;
    api4_tu_split[HAD_8x8] = &i4_tu_split_flag;
    api4_tu_split[HAD_4x4] = NULL;

    api4_tu_early_cbf[HAD_32x32] = NULL;
    api4_tu_early_cbf[HAD_16x16] = NULL;
    api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf;
    api4_tu_early_cbf[HAD_4x4] = NULL;

    /* The returned word carries the cost plus two flag bits (unpacked below) */
    i4_packed_result = ps_func_selector->pf_had_8x8_using_4_4x4_r(
        ps_prms->pu1_inp,
        ps_prms->i4_inp_stride,
        ps_prms->pu1_ref,
        ps_prms->i4_ref_stride,
        pi2_had_coeffs,
        8,
        api4_satd_pu,
        api4_tu_split,
        api4_tu_early_cbf,
        0,
        2,
        0,
        0,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        NULL);

    /* 2Nx2N SATD is the value the kernel left in the 8x8 scratch slot */
    pi4_satd_out[PART_ID_2Nx2N] = i4_satd_8x8;

    /* Bit 1 of the packed word is the TU split flag */
    pi4_split_out[PART_ID_2Nx2N] = (i4_packed_result & 0x3) >> 1;

    /* Bit 0 of the packed word is the early CBF flag */
    pi4_cbf_out[PART_ID_2Nx2N] = (i4_packed_result & 0x1);

    /* Everything above the two flag bits is the total SATD cost */
    return (i4_packed_result >> 2);
}
1297 //#endif
1298 /**
1299 ********************************************************************************
1300 *  @fn     S32 hme_evalsatd_update_1_best_result_pt_pu_16x16
1301 *
1302 *  @brief  Evaluates the SATD with partial updates for all the best partitions
1303 *          of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1304 *
1305 *  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1306 *                 pointer to sad grid of each partitions
1307 *
1308 *  @return     None
1309 ********************************************************************************
1310 */
1311 
hme_evalsatd_update_2_best_results_pt_pu_16x16(err_prms_t * ps_prms,result_upd_prms_t * ps_result_prms)1312 void hme_evalsatd_update_2_best_results_pt_pu_16x16(
1313     err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1314 {
1315     S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1316     S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1317     S32 i4_satd_16x16; /* 16x16 satd cost     */
1318     S32 i;
1319     S16 ai2_8x8_had[256];
1320     S16 *pi2_y0;
1321     U08 *pu1_src, *pu1_pred;
1322     S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1323     S32 *ppi4_hsad;
1324 
1325     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1326     S32 *api4_satd_pu[HAD_32x32 + 1];
1327     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1328 
1329     U08 *pu1_inp = ps_prms->pu1_inp;
1330     U08 *pu1_ref = ps_prms->pu1_ref;
1331 
1332     S32 inp_stride = ps_prms->i4_inp_stride;
1333     S32 ref_stride = ps_prms->i4_ref_stride;
1334 
1335     api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1336     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1337     api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1338     api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1339 
1340     ppi4_hsad = api4_satd_pu[HAD_16x16];
1341 
1342     /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1343     for(i = 0; i < 4; i++)
1344     {
1345         pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1346         pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1347         pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1348         pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1349 
1350         ihevce_had_8x8_using_4_4x4(
1351             pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1352     }
1353 
1354     /* For SATD computation following TU size are assumed for a 16x16 CU */
1355     /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
1356 
1357     /* Update 8x8 SATDs */
1358     /* Modified to cost calculation using only 4x4 SATD */
1359 
1360     //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1361     //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1362     //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1363     //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1364 
1365     /* Update 16x16 SATDs */
1366     pi4_sad_grid[PART_ID_2Nx2N] =
1367         ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1368 
1369     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1370     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1371     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1372     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1373 
1374     /* Update 8x16 / 16x8 SATDs */
1375     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1376     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1377     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1378     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1379 
1380     /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
1381     pi4_sad_grid[PART_ID_nLx2N_L] =
1382         ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12];
1383 
1384     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] +
1385                                     ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1386 
1387     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] +
1388                                     ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1389 
1390     pi4_sad_grid[PART_ID_nRx2N_R] =
1391         ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15];
1392 
1393     pi4_sad_grid[PART_ID_2NxnU_T] =
1394         ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3];
1395 
1396     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] +
1397                                     ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B];
1398 
1399     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] +
1400                                     ai4_satd_4x4[11] + pi4_sad_grid[PART_ID_2NxN_T];
1401 
1402     pi4_sad_grid[PART_ID_2NxnD_B] =
1403         ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1404 
1405     /* Call the update results function */
1406     {
1407         S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1408         mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1409         S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1410         S32 best_node_cost;
1411         S32 second_best_node_cost;
1412 
1413         /*For each valid partition, update the refine_prm structure to reflect the best and second
1414         best candidates for that partition*/
1415 
1416         for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1417         {
1418             S32 update_required = 0;
1419             S32 part_id = pi4_valid_part_ids[i4_count];
1420             S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1421 
1422             /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1423             i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1424 
1425             /*Calculate total cost*/
1426             i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1427             i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1428 
1429             /*****************************************************************/
1430             /* We do not labor through the results if the total cost worse   */
1431             /* than the last of the results.                                 */
1432             /*****************************************************************/
1433             best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1434             second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
1435 
1436             if(i4_tot_cost < second_best_node_cost)
1437             {
1438                 update_required = 2;
1439 
1440                 /*************************************************************/
1441                 /* Identify where the current result isto be placed.Basically*/
1442                 /* find the node which has cost just higher thannodeundertest*/
1443                 /*************************************************************/
1444                 if(i4_tot_cost < best_node_cost)
1445                 {
1446                     update_required = 1;
1447                 }
1448                 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1449                 {
1450                     update_required = 0;
1451                 }
1452                 if(update_required == 2)
1453                 {
1454                     ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1455                     ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1456                     ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1457                     ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1458                     ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1459                 }
1460                 else if(update_required == 1)
1461                 {
1462                     ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
1463                         ps_subpel_refine_ctxt->i2_tot_cost[0][index];
1464                     ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
1465                         ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1466                     ps_subpel_refine_ctxt->i2_mv_x[1][index] =
1467                         ps_subpel_refine_ctxt->i2_mv_x[0][index];
1468                     ps_subpel_refine_ctxt->i2_mv_y[1][index] =
1469                         ps_subpel_refine_ctxt->i2_mv_y[0][index];
1470                     ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
1471                         ps_subpel_refine_ctxt->i2_ref_idx[0][index];
1472 
1473                     ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1474                     ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1475                     ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1476                     ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1477                     ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1478                 }
1479             }
1480         }
1481     }
1482 }
1483 
1484 //#if COMPUTE_16x16_R == C
/**
********************************************************************************
*  @fn     void hme_evalsatd_update_1_best_result_pt_pu_16x16
*
*  @brief  Fills the sad grid of a 16x16 block for every partition id from the
*          8x8 and 4x4 SATD arrays, then keeps only the best result per valid
*          partition in the subpel refine context
*
*  @param[inout]  ps_prms: error prms containing input and ref ptr, strides,
*                 pointer to sad grid of each partition
*
*  @param[inout]  ps_result_prms: candidate mv and ref idx, plus the subpel
*                 refine context whose best entry (index 0) may be replaced
*
*  @return     None
********************************************************************************
*/
void hme_evalsatd_update_1_best_result_pt_pu_16x16(
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
{
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
    S32 i4_satd_16x16; /* target of the 16x16 table slot; never read here */
    S16 ai2_8x8_had[256];
    S32 i;

    /* Table of output ptrs handed to the HAD routine, one slot per level */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    /* One HAD call per 8x8 quadrant, visited in TL, TR, BL, BR order */
    for(i = 0; i < 4; i++)
    {
        S32 blk_x = (i & 0x01);
        S32 blk_y = (i >> 1);
        U08 *pu1_src = pu1_inp + blk_x * 8 + blk_y * inp_stride * 8;
        U08 *pu1_pred = pu1_ref + blk_x * 8 + blk_y * ref_stride * 8;
        S16 *pi2_y0 = ai2_8x8_had + blk_x * 8 + blk_y * 16 * 8;
        /* x position (in 4x4 units) in the low bits, y position from bit 16 */
        S32 pos_x_y_4x4_0 = blk_x * 2 + blk_y * (2 << 16);

        ihevce_had_8x8_using_4_4x4(
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
    }

    /* ai4_satd_8x8 and ai4_satd_4x4 are not written directly in this         */
    /* function; they are expected to be filled by the HAD calls above        */
    /* through api4_satd_pu. NOTE(review): confirm against the HAD routine.   */

    /* Update 16x16 SATD */
    pi4_sad_grid[PART_ID_2Nx2N] =
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* Update 8x8 SATDs */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];

    /* Update 8x16 / 16x8 SATDs */
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* Update AMP SATDs 16x12,16x4, 12x16,4x16. The narrow strip is summed    */
    /* from four 4x4 SATDs. NOTE(review): the indices depend on the order in  */
    /* which the HAD routine stores its 4x4 outputs; confirm.                 */
    pi4_sad_grid[PART_ID_nLx2N_L] =
        ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
    pi4_sad_grid[PART_ID_nRx2N_R] =
        ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
    pi4_sad_grid[PART_ID_2NxnU_T] =
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
    pi4_sad_grid[PART_ID_2NxnD_B] =
        ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

    /* The wide AMP partition is the remainder of the full 16x16 */
    pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
    pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
    pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
    pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];

    /* Update the best result of every valid partition */
    {
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
        S32 i4_count;

        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
        {
            S32 part_id = pi4_valid_part_ids[i4_count];

            /* With more than 8 valid parts the context is indexed by part id, */
            /* otherwise by position in the valid part list                    */
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;

            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
            S32 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];

            /* Total cost, saturated to 16 bit */
            S32 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
            S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

            S32 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);

            /*****************************************************************/
            /* Only one result is tracked: replace it when the candidate is  */
            /* strictly better. A cost equal to SHRT_MAX is never accepted   */
            /* and ties keep the existing result.                            */
            /*****************************************************************/
            if((i4_tot_cost < SHRT_MAX) && (i4_tot_cost < best_node_cost))
            {
                ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
                ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
                ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
            }
        }
    }
}
1635 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_16x16_tu_rec
*
*  @brief  Runs the 16x16 recursive HAD function taken from the function
*          selector on the input/ref pair and reports the 16x16 SATD along
*          with the 16x16 TU split flag and early cbf flag
*
*  @param[inout]  ps_prms: error prms containing input and ref ptr, strides,
*                 working memory and max transform depth/size. The field
*                 i4_tu_split_cost is reset to 0 before the HAD call, and
*                 entry 0 of pi4_tu_split_flags and pi4_tu_early_cbf is written
*
*  @param[in]  lambda, lambda_q_shift, i4_frm_qstep: forwarded unchanged to
*              the HAD function
*
*  @param[in]  ps_func_selector: supplies pf_had_16x16_r
*
*  @return     Value held in the 16x16 SATD slot after the HAD call
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    /* Per level output storage for the HAD function */
    S32 ai4_cost_4x4[16]; /* num 4x4s in a 16x16 */
    S32 ai4_cost_8x8[4]; /* num 8x8s in a 16x16 */
    S32 i4_cost_16x16;

    S32 ai4_split_8x8[16];
    S32 i4_split_16x16 = 0;

    S32 ai4_cbf_8x8[16];
    S32 i4_cbf_16x16 = 0;

    /* Ptr tables, one slot per HAD level, passed to the HAD function */
    S32 *api4_cost[HAD_32x32 + 1];
    S32 *api4_split[HAD_32x32 + 1];
    S32 *api4_cbf[HAD_32x32 + 1];

    /* Working memory of the caller is used for the HAD output */
    S16 *pi2_scratch = (S16 *)ps_prms->pu1_wkg_mem;

    /* 32x32 slots stay NULL: not used for 16x16 subpel refine */
    api4_cost[HAD_4x4] = &ai4_cost_4x4[0];
    api4_cost[HAD_8x8] = &ai4_cost_8x8[0];
    api4_cost[HAD_16x16] = &i4_cost_16x16;
    api4_cost[HAD_32x32] = NULL;

    api4_split[HAD_4x4] = NULL;
    api4_split[HAD_8x8] = &ai4_split_8x8[0];
    api4_split[HAD_16x16] = &i4_split_16x16;
    api4_split[HAD_32x32] = NULL;

    api4_cbf[HAD_4x4] = NULL;
    api4_cbf[HAD_8x8] = &ai4_cbf_8x8[0];
    api4_cbf[HAD_16x16] = &i4_cbf_16x16;
    api4_cbf[HAD_32x32] = NULL;

    /* The split cost is handed to the HAD function by address, start from 0 */
    ps_prms->i4_tu_split_cost = 0;

    ps_func_selector->pf_had_16x16_r(
        ps_prms->pu1_inp,
        ps_prms->i4_inp_stride,
        ps_prms->pu1_ref,
        ps_prms->i4_ref_stride,
        pi2_scratch,
        16,
        api4_cost,
        api4_split,
        api4_cbf,
        0,
        4,
        lambda,
        lambda_q_shift,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        NULL);

    /* Publish the 16x16 level flags to the caller */
    ps_prms->pi4_tu_split_flags[0] = i4_split_16x16;
    ps_prms->pi4_tu_early_cbf[0] = i4_cbf_16x16;

    return i4_cost_16x16;
}
1717 
1718 /**
1719 ********************************************************************************
1720 *  @fn     S32 hme_evalsatd_pt_pu_32x32
1721 *
1722 *  @brief  Evaluates the SATD with partial updates for all the best partitions
1723 *          of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1724 *
1725 *  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1726 *                 pointer to sad grid of each partitions
1727 *
1728 *  @return     None
1729 ********************************************************************************
1730 */
hme_evalsatd_pt_pu_32x32(err_prms_t * ps_prms)1731 void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
1732 {
1733     //S32 ai4_satd_4x4[64];   /* num 4x4s in a 32x32 */
1734     S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1735     S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1736     S32 i4_satd_32x32;
1737     //    S16 ai2_had_out[32*32];
1738     U08 *pu1_src;
1739     U08 *pu1_pred;
1740     S32 i;
1741 
1742     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1743     S32 *api4_satd_pu[HAD_32x32 + 1];
1744     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1745 
1746     U08 *pu1_inp = ps_prms->pu1_inp;
1747     U08 *pu1_ref = ps_prms->pu1_ref;
1748 
1749     S32 inp_stride = ps_prms->i4_inp_stride;
1750     S32 ref_stride = ps_prms->i4_ref_stride;
1751 
1752     //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[0];
1753     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1754     api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1755     api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1756 
1757     /* 32x32 SATD is calculates as the sum of the 4 8x8's in the block */
1758     for(i = 0; i < 16; i++)
1759     {
1760         pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8);
1761 
1762         pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8);
1763 
1764         ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1765             pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1766     }
1767 
1768     /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1769     ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
1770     ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
1771     ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
1772     ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1773 
1774     /* Update 32x32 SATD */
1775     pi4_sad_grid[PART_ID_2Nx2N] =
1776         ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];
1777 
1778     /* Update 16x16 SATDs */
1779     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
1780     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
1781     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
1782     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];
1783 
1784     /* Update 16x32 / 32x16 SATDs */
1785     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
1786     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
1787     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
1788     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];
1789 
1790     /* Update AMP SATDs 32x24,32x8, 24x32,8x32  */
1791     pi4_sad_grid[PART_ID_nLx2N_L] =
1792         ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];
1793 
1794     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
1795                                     ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1796 
1797     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
1798                                     ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1799 
1800     pi4_sad_grid[PART_ID_nRx2N_R] =
1801         ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];
1802 
1803     pi4_sad_grid[PART_ID_2NxnU_T] =
1804         ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1805 
1806     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
1807                                     ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];
1808 
1809     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
1810                                     ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];
1811 
1812     pi4_sad_grid[PART_ID_2NxnD_B] =
1813         ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1814 }
1815 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_32x32_tu_rec
*
*  @brief  Runs the recursive 32x32 HAD on the input/ref pair and reports the
*          32x32 SATD along with the 32x32 TU split flag and early cbf flag
*
*  @param[inout]  ps_prms: error prms containing input and ref ptr, strides,
*                 working memory and max transform depth/size. The field
*                 i4_tu_split_cost is reset to 0 before the HAD call, and the
*                 PART_ID_2Nx2N entry of pi4_sad_grid, pi4_tu_split_flags and
*                 pi4_tu_early_cbf is written
*
*  @param[in]  lambda, lambda_q_shift, i4_frm_qstep: forwarded unchanged to
*              the HAD function
*
*  @param[in]  ps_func_selector: forwarded unchanged to the HAD function
*
*  @return     Value held in the 32x32 SATD slot after the HAD call
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
    S32 ai4_tu_split_8x8[16];
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
    S32 ai4_tu_split_16x16[4];
    S32 i4_satd_32x32;

    S32 ai4_tu_early_cbf_8x8[16];
    S32 ai4_tu_early_cbf_16x16[4];
    /* Starts from 0, as in the 16x16 variant, so that the value read back    */
    /* below is defined even if the HAD call leaves the slot untouched        */
    S32 early_cbf_flag = 0;

    S16 *pi2_had_out;

    /* Arrays of ptrs, one slot per HAD level of the 32x32 */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *api4_tu_split[HAD_32x32 + 1];
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
    S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
    S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;

    S32 tu_split_flag = 0;
    S32 total_satd_cost = 0;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Initialize tu_split_cost to "0" */
    ps_prms->i4_tu_split_cost = 0;

    /* Working memory of the caller is used for the HAD output */
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;

    api4_tu_split[HAD_4x4] = NULL;
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
    api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
    api4_tu_split[HAD_32x32] = &tu_split_flag;

    api4_tu_early_cbf[HAD_4x4] = NULL;
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
    api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
    api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;

    /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
    ihevce_had_32x32_r(
        pu1_inp,
        inp_stride,
        pu1_ref,
        ref_stride,
        pi2_had_out,
        32,
        api4_satd_pu,
        api4_tu_split,
        api4_tu_early_cbf,
        0,
        8,
        lambda,
        lambda_q_shift,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        ps_func_selector);

    total_satd_cost = i4_satd_32x32;

    /*The structure of the TU_SPLIT flag for the current 32x32 is as follows
    TL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    TR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    BL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    BR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    32x32_split - 1bit (LSBit)

    TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/

    pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
    pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
    pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;

    return total_satd_cost;
}
1913 
1914 /**
1915 ********************************************************************************
1916 *  @fn     S32 hme_evalsatd_pt_pu_64x64
1917 *
1918 *  @brief  Evaluates the SATD with partial updates for all the best partitions
1919 *          of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 satds
1920 *
1921 *           Note : 64x64 SATD does not do hadamard Transform using 32x32 hadamard
1922 *                  outputs but directly uses four 32x32 SATD and 16 16x16 SATDS as
1923 *                  TU size of 64 is not supported in HEVC
1924 *
1925 *  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1926 *                 pointer to sad grid of each partitions
1927 *
1928 *  @return     None
1929 ********************************************************************************
1930 */
1931 
hme_evalsatd_pt_pu_64x64(err_prms_t * ps_prms)1932 void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
1933 {
1934     //S32 ai4_satd_4x4[4][64];   /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
1935     S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
1936     S32 ai4_satd_16x16[4][4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
1937     S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
1938     //    S16 ai2_had_out[32*32];
1939     S32 i, j;
1940 
1941     //  S32 ai4_tu_split_8x8[4][16];
1942     //  S32 ai4_tu_split_16x16[4][4];
1943     //  S32 ai4_tu_split_32x32[4];
1944 
1945     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1946     S32 *api4_satd_pu[HAD_32x32 + 1];
1947     //  S32 *api4_tu_split[HAD_32x32 + 1];
1948 
1949     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1950 
1951     U08 *pu1_inp = ps_prms->pu1_inp;
1952     U08 *pu1_ref = ps_prms->pu1_ref;
1953     U08 *pu1_src;
1954     U08 *pu1_pred;
1955 
1956     S32 inp_stride = ps_prms->i4_inp_stride;
1957     S32 ref_stride = ps_prms->i4_ref_stride;
1958 
1959     for(i = 0; i < 4; i++)
1960     {
1961         S32 blkx = (i & 0x1);
1962         S32 blky = (i >> 1);
1963         U08 *pu1_pi0, *pu1_pi1;
1964 
1965         //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[i][0];
1966         api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0];
1967         api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0];
1968         api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
1969 
1970         pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
1971         pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
1972 
1973         /* 64x64 SATD is calculates as the sum of the 4 16x16's in the block */
1974         for(j = 0; j < 16; j++)
1975         {
1976             pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
1977 
1978             pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);
1979 
1980             ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1981                 pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1982         }
1983 
1984         /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1985         ai4_satd_16x16[i][0] =
1986             ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
1987         ai4_satd_16x16[i][1] =
1988             ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
1989         ai4_satd_16x16[i][2] =
1990             ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
1991         ai4_satd_16x16[i][3] =
1992             ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];
1993     }
1994 
1995     /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1996 
1997     ai4_satd_32x32[0] =
1998         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3];
1999     ai4_satd_32x32[1] =
2000         ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3];
2001     ai4_satd_32x32[2] =
2002         ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3];
2003     ai4_satd_32x32[3] =
2004         ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2005 
2006     /* Update 64x64 SATDs */
2007     pi4_sad_grid[PART_ID_2Nx2N] =
2008         ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2009 
2010     /* Update 32x32 SATDs */
2011     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
2012     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
2013     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
2014     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];
2015 
2016     /* Update 32x64 / 64x32 SATDs */
2017     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
2018     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
2019     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
2020     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];
2021 
2022     /* Update AMP SATDs 64x48,64x16, 48x64,16x64  */
2023     pi4_sad_grid[PART_ID_nLx2N_L] =
2024         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2];
2025 
2026     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] +
2027                                     ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] +
2028                                     pi4_sad_grid[PART_ID_Nx2N_R];
2029 
2030     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] +
2031                                     ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] +
2032                                     pi4_sad_grid[PART_ID_Nx2N_L];
2033 
2034     pi4_sad_grid[PART_ID_nRx2N_R] =
2035         ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3];
2036 
2037     pi4_sad_grid[PART_ID_2NxnU_T] =
2038         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1];
2039 
2040     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] +
2041                                     ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] +
2042                                     pi4_sad_grid[PART_ID_2NxN_B];
2043 
2044     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] +
2045                                     ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] +
2046                                     pi4_sad_grid[PART_ID_2NxN_T];
2047 
2048     pi4_sad_grid[PART_ID_2NxnD_B] =
2049         ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2050 }
2051 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_64x64_tu_rec
*
*  @brief  Runs the recursive 32x32 HAD once per 32x32 quadrant of a 64x64
*          block and reports the sum of the four 32x32 SATDs
*
*  @param[inout]  ps_prms: error prms containing input and ref ptr, strides,
*                 working memory and max transform depth/size. The field
*                 i4_tu_split_cost is reset to 0 before the first HAD call;
*                 entries 0..3 of pi4_tu_split_flags and pi4_tu_early_cbf are
*                 handed to the HAD calls as the 32x32 level outputs, and the
*                 PART_ID_2Nx2N entry of pi4_sad_grid is written
*
*  @param[in]  lambda, lambda_q_shift, i4_frm_qstep: forwarded unchanged to
*              the HAD function
*
*  @param[in]  ps_func_selector: forwarded unchanged to the HAD function
*
*  @return     Sum of the four 32x32 SATD slots after the HAD calls
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    /* Output storage for one 32x32; the same arrays are reused per quadrant */
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64, one entry per quadrant */

    S32 ai4_tu_split_8x8[16];
    S32 ai4_tu_split_16x16[4];

    S32 ai4_tu_early_cbf_8x8[16];
    S32 ai4_tu_early_cbf_16x16[4];

    S16 *pi2_had_out;
    S32 i;

    /* Arrays of ptrs, one slot per HAD level of a 32x32 */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *api4_tu_split[HAD_32x32 + 1];
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;

    S32 total_satd_cost;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Initialize tu_split_cost to "0"; all four HAD calls receive its address */
    ps_prms->i4_tu_split_cost = 0;

    /* Working memory of the caller is used for the HAD output */
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;

    for(i = 0; i < 4; i++)
    {
        S32 blkx = (i & 0x1);
        S32 blky = (i >> 1);
        U08 *pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
        U08 *pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);

        /* The ptr tables are rebuilt for every quadrant; only the 32x32      */
        /* level slots change with i                                          */
        api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
        api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
        api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
        api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];

        api4_tu_split[HAD_4x4] = NULL;
        api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
        api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
        api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i];

        api4_tu_early_cbf[HAD_4x4] = NULL;
        api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
        api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
        api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i];

        /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
        ihevce_had_32x32_r(
            pu1_pi0,
            inp_stride,
            pu1_pi1,
            ref_stride,
            pi2_had_out,
            32,
            api4_satd_pu,
            api4_tu_split,
            api4_tu_early_cbf,
            0,
            8,
            lambda,
            lambda_q_shift,
            i4_frm_qstep,
            1,
            ps_prms->u1_max_tr_depth,
            ps_prms->u1_max_tr_size,
            &(ps_prms->i4_tu_split_cost),
            ps_func_selector);
    }

    /* 64x64 SATD is the sum of the four 32x32 SATDs */
    total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];

    /* Update 64x64 SATDs */
    pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;

    return total_satd_cost;
}
2150 
2151 /**
2152 ********************************************************************************
2153 *  @fn     void hme_subpel_refine_search_node(search_node_t *ps_search_node,
2154 *                                   hme_subpel_prms_t *ps_prms,
2155 *                                   layer_ctxt_t *ps_curr_layer,
2156 *                                   BLK_SIZE_T e_blk_size,
2157 *                                   S32 x_off,
2158 *                                   S32 y_off)
2159 *
2160 *  @brief  Refines a given partition within a CU
2161 *
2162 *  @param[in,out]  ps_search_node: supplies starting mv and also ref id.
2163 *                   updated with the accurate subpel mv
2164 *
2165 *  @param[in]  ps_prms: subpel prms input to this function
2166 *
2167 *  @param[in]  ps_curr_layer : layer context
2168 *
2169 *  @param[in]  e_blk_size : Block size enumeration
2170 *
2171 *  @param[in]  x_off : x offset of the partition w.r.t. pic start
2172 *
2173 *  @param[in]  y_off : y offset of the partition w.r.t. pic start
2174 *
2175 *  @return None
2176 ********************************************************************************
2177 */
2178 
hme_get_calc_sad_and_result_subpel_fxn(me_func_selector_t * ps_func_selector,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list,S32 i4_part_mask,U08 u1_use_satd,U08 u1_num_parts,U08 u1_num_results)2179 static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
2180     me_func_selector_t *ps_func_selector,
2181     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
2182     S32 i4_part_mask,
2183     U08 u1_use_satd,
2184     U08 u1_num_parts,
2185     U08 u1_num_results)
2186 {
2187     PF_SAD_RESULT_FXN_T pf_err_compute;
2188 
2189     ASSERT((1 == u1_num_results) || (2 == u1_num_results));
2190 
2191     if(1 == u1_num_results)
2192     {
2193         if(u1_use_satd)
2194         {
2195             if(u1_num_parts == 1)
2196             {
2197                 pf_err_compute =
2198                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
2199             }
2200             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2201             {
2202                 pf_err_compute =
2203                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
2204             }
2205             else
2206             {
2207                 pf_err_compute =
2208                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
2209             }
2210         }
2211         else
2212         {
2213             if(u1_num_parts == 1)
2214             {
2215                 pf_err_compute = ps_me_optimised_function_list
2216                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
2217             }
2218             else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2219             {
2220                 pf_err_compute =
2221                     ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
2222             }
2223             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2224             {
2225                 pf_err_compute = ps_me_optimised_function_list
2226                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
2227             }
2228             else
2229             {
2230                 pf_err_compute = ps_me_optimised_function_list
2231                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
2232             }
2233         }
2234     }
2235     else
2236     {
2237         if(u1_use_satd)
2238         {
2239             if(u1_num_parts == 1)
2240             {
2241                 pf_err_compute =
2242                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
2243             }
2244             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2245             {
2246                 pf_err_compute =
2247                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
2248             }
2249             else
2250             {
2251                 pf_err_compute =
2252                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
2253             }
2254         }
2255         else
2256         {
2257             if(u1_num_parts == 1)
2258             {
2259                 pf_err_compute = ps_me_optimised_function_list
2260                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
2261             }
2262             else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2263             {
2264                 pf_err_compute = ps_me_optimised_function_list
2265                                      ->pf_calc_sad_and_2_best_results_subpel_square_parts;
2266             }
2267             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2268             {
2269                 pf_err_compute = ps_me_optimised_function_list
2270                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
2271             }
2272             else
2273             {
2274                 pf_err_compute = ps_me_optimised_function_list
2275                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
2276             }
2277         }
2278     }
2279 
2280     return pf_err_compute;
2281 }
2282 
2283 #if DIAMOND_GRID == 1
hme_subpel_refine_search_node_high_speed(search_node_t * ps_search_node,hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,BLK_SIZE_T e_blk_size,S32 x_off,S32 y_off,search_results_t * ps_search_results,S32 pred_lx,S32 i4_part_mask,S32 * pi4_valid_part_ids,S32 search_idx,subpel_dedup_enabler_t * ps_dedup_enabler,me_func_selector_t * ps_func_selector,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)2284 S32 hme_subpel_refine_search_node_high_speed(
2285     search_node_t *ps_search_node,
2286     hme_subpel_prms_t *ps_prms,
2287     layer_ctxt_t *ps_curr_layer,
2288     BLK_SIZE_T e_blk_size,
2289     S32 x_off,
2290     S32 y_off,
2291     search_results_t *ps_search_results,
2292     S32 pred_lx,
2293     S32 i4_part_mask,
2294     S32 *pi4_valid_part_ids,
2295     S32 search_idx,
2296     subpel_dedup_enabler_t *ps_dedup_enabler,
2297     me_func_selector_t *ps_func_selector,
2298     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
2299 {
2300     S32 i4_num_hpel_refine, i4_num_qpel_refine;
2301     S32 i4_offset, i4_grid_mask;
2302     S08 i1_ref_idx;
2303     S32 i4_blk_wd, i4_blk_ht;
2304     S32 i4_ref_stride, i4_i;
2305     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2306     result_upd_prms_t s_result_prms;
2307     search_node_t s_temp_search_node;
2308 
2309     /*************************************************************************/
2310     /* Tracks current MV with the fractional component.                      */
2311     /*************************************************************************/
2312     S32 i4_mv_x, i4_mv_y;
2313     S32 i4_frac_x, i4_frac_y;
2314 
2315     /*************************************************************************/
2316     /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2317     /* This function                                                         */
2318     /*************************************************************************/
2319     PF_SAD_RESULT_FXN_T pf_err_compute;
2320 
2321     S32 ai4_sad_grid[17], i4_tot_cost;
2322     err_prms_t s_err_prms;
2323 
2324     /*************************************************************************/
2325     /* Allowed MV RANGE                                                      */
2326     /*************************************************************************/
2327     range_prms_t *ps_range_prms;
2328 
2329     /*************************************************************************/
2330     /* stores min id in grid with associated min cost.                       */
2331     /*************************************************************************/
2332     S32 i4_min_cost, i4_min_sad;
2333     GRID_PT_T e_min_id;
2334 
2335     PF_INTERP_FXN_T pf_qpel_interp;
2336     /*************************************************************************/
2337     /* For hpel and qpel we move in diamonds and hence each point in the     */
2338     /* diamond will belong to a completely different plane. To simplify the  */
2339     /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2340     /* hpel planes which are interpolated during recon.                      */
2341     /*************************************************************************/
2342     U08 *apu1_hpel_ref[4], *pu1_ref;
2343 
2344     interp_prms_t s_interp_prms;
2345 
2346     /*************************************************************************/
2347     /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2348     /* points to the corresponding predicted buf with its stride.            */
2349     /* Note that the pointer cannot be derived just from the id, since the   */
2350     /* pointer may also point to the hpel buffer (in case we request interp  */
2351     /* of a hpel pt, which already exists in the recon hpel planes)          */
2352     /*************************************************************************/
2353     U08 *pu1_final_out;
2354     S32 i4_final_out_stride;
2355     S32 part_id;
2356     S32 check_for_duplicate = 0;
2357 
2358     subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
2359 
2360     S32 mvx_qpel;
2361     S32 mvy_qpel;
2362 
2363     pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
2364         ps_func_selector,
2365         ps_me_optimised_function_list,
2366         i4_part_mask,
2367         ps_prms->i4_use_satd,
2368         ps_subpel_refine_ctxt->i4_num_valid_parts,
2369         ps_search_results->u1_num_results_per_part);
2370 
2371     i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2372     i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2373 
2374     /* Prediction contet should now deal with qpel units */
2375     HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2376 
2377     /* Buffer allocation for subpel */
2378     /* Current design is that there may be many partitions and different mvs */
2379     /* that attempt subpel refinemnt. While there is possibility of overlap, the */
2380     /* hashing to detect and avoid overlap may be very complex. So, currently,   */
2381     /* the only thing done is to store the eventual predicted buffer with every  */
2382     /* ctb node that holds the result of hte best subpel search */
2383 
2384     /* Compute the base pointer for input, interpolated buffers */
2385     /* The base pointers point as follows: */
2386     /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */
2387     /* To these, we need to add the offset of the current node */
2388     i4_ref_stride = ps_curr_layer->i4_rec_stride;
2389     i4_offset = x_off + (y_off * i4_ref_stride);
2390     i1_ref_idx = ps_search_node->i1_ref_idx;
2391 
2392     apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
2393     apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
2394     apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
2395     apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
2396 
2397     /* Initialize result params used for partition update */
2398     s_result_prms.pf_mv_cost_compute = NULL;
2399     s_result_prms.ps_search_results = ps_search_results;
2400     s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
2401     s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
2402     s_result_prms.u1_pred_lx = search_idx;
2403     s_result_prms.i4_part_mask = i4_part_mask;
2404     s_result_prms.ps_search_node_base = ps_search_node;
2405     s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
2406     s_result_prms.i4_grid_mask = 1;
2407     s_result_prms.ps_search_node = &s_temp_search_node;
2408     s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;
2409 
2410     /* convert to hpel units */
2411     i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
2412     i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
2413 
2414     /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
2415     ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
2416     i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
2417     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2418 
2419     i4_min_cost = MAX_32BIT_VAL;
2420     i4_min_sad = MAX_32BIT_VAL;
2421 
2422     /*************************************************************************/
2423     /* Prepare the input params to SAD/SATD function. Note that input is     */
2424     /* passed from the calling funcion since it may be I (normal subpel      */
2425     /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
2426     /* Both cases are handled here.                                          */
2427     /*************************************************************************/
2428     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
2429     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
2430     s_err_prms.i4_ref_stride = i4_ref_stride;
2431     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
2432     s_err_prms.i4_grid_mask = 1;
2433     s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
2434     s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
2435     s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
2436 
2437     s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;
2438 
2439     part_id = ps_search_node->u1_part_id;
2440     for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
2441     {
2442         e_min_id = PT_C;
2443 
2444         mvx_qpel = i4_mv_x << 1;
2445         mvy_qpel = i4_mv_y << 1;
2446 
2447         /* Central pt */
2448         if(i4_grid_mask & BIT_EN(PT_C))
2449         {
2450             //ps_search_node->i2_mv_x = (S16)i4_mv_x;
2451             //ps_search_node->i2_mv_x = (S16)i4_mv_y;
2452             /* central pt is i4_mv_x, i4_mv_y */
2453             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2454                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
2455 
2456             i4_frac_x = i4_mv_x & 1;
2457             i4_frac_y = i4_mv_y & 1;
2458             pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2459             s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2460 
2461             /* Update the mv's with the current candt motion vectors */
2462             s_result_prms.i2_mv_x = mvx_qpel;
2463             s_result_prms.i2_mv_y = mvy_qpel;
2464             s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2465             s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2466 
2467             pf_err_compute(&s_err_prms, &s_result_prms);
2468 
2469             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2470             if(i4_tot_cost < i4_min_cost)
2471             {
2472                 i4_min_cost = i4_tot_cost;
2473                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2474                 e_min_id = PT_C;
2475                 pu1_final_out = s_err_prms.pu1_ref;
2476             }
2477         }
2478 
2479         /* left pt */
2480         if(i4_grid_mask & BIT_EN(PT_L))
2481         {
2482             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2483                 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
2484 
2485             if(!check_for_duplicate)
2486             {
2487                 /* search node mv is stored in qpel units */
2488                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
2489                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2490                 /* central pt is i4_mv_x - 1, i4_mv_y */
2491                 i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
2492                 i4_frac_y = i4_mv_y & 1;
2493                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2494                 s_err_prms.pu1_ref =
2495                     pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2496 
2497                 /* Update the mv's with the current candt motion vectors */
2498                 s_result_prms.i2_mv_x = mvx_qpel - 2;
2499                 s_result_prms.i2_mv_y = mvy_qpel;
2500                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
2501                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2502 
2503                 pf_err_compute(&s_err_prms, &s_result_prms);
2504                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2505                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2506                 if(i4_tot_cost < i4_min_cost)
2507                 {
2508                     i4_min_cost = i4_tot_cost;
2509                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2510                     e_min_id = PT_L;
2511                     pu1_final_out = s_err_prms.pu1_ref;
2512                 }
2513             }
2514         }
2515         /* top pt */
2516         if(i4_grid_mask & BIT_EN(PT_T))
2517         {
2518             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2519                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
2520 
2521             if(!check_for_duplicate)
2522             {
2523                 /* search node mv is stored in qpel units */
2524                 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
2525                 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
2526                 /* top pt is i4_mv_x, i4_mv_y - 1 */
2527                 i4_frac_x = i4_mv_x & 1;
2528                 i4_frac_y = (i4_mv_y - 1) & 1;
2529                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2530                 s_err_prms.pu1_ref =
2531                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
2532 
2533                 /* Update the mv's with the current candt motion vectors */
2534                 s_result_prms.i2_mv_x = mvx_qpel;
2535                 s_result_prms.i2_mv_y = mvy_qpel - 2;
2536                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2537                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;
2538 
2539                 pf_err_compute(&s_err_prms, &s_result_prms);
2540                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2541                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2542                 if(i4_tot_cost < i4_min_cost)
2543                 {
2544                     i4_min_cost = i4_tot_cost;
2545                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2546                     e_min_id = PT_T;
2547                     pu1_final_out = s_err_prms.pu1_ref;
2548                 }
2549             }
2550         }
2551         /* right pt */
2552         if(i4_grid_mask & BIT_EN(PT_R))
2553         {
2554             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2555                 ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
2556             if(!check_for_duplicate)
2557             {
2558                 /* search node mv is stored in qpel units */
2559                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
2560                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2561                 /* right pt is i4_mv_x + 1, i4_mv_y */
2562                 i4_frac_x = (i4_mv_x + 1) & 1;
2563                 i4_frac_y = i4_mv_y & 1;
2564 
2565                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2566                 s_err_prms.pu1_ref =
2567                     pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2568 
2569                 /* Update the mv's with the current candt motion vectors */
2570                 s_result_prms.i2_mv_x = mvx_qpel + 2;
2571                 s_result_prms.i2_mv_y = mvy_qpel;
2572                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
2573                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2574 
2575                 pf_err_compute(&s_err_prms, &s_result_prms);
2576                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2577                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2578                 if(i4_tot_cost < i4_min_cost)
2579                 {
2580                     i4_min_cost = i4_tot_cost;
2581                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2582                     e_min_id = PT_R;
2583                     pu1_final_out = s_err_prms.pu1_ref;
2584                 }
2585             }
2586         }
2587         /* bottom pt */
2588         if(i4_grid_mask & BIT_EN(PT_B))
2589         {
2590             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2591                 ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
2592             if(!check_for_duplicate)
2593             {
2594                 /* search node mv is stored in qpel units */
2595                 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
2596                 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
2597                 i4_frac_x = i4_mv_x & 1;
2598                 i4_frac_y = (i4_mv_y + 1) & 1;
2599                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2600                 s_err_prms.pu1_ref =
2601                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
2602 
2603                 /* Update the mv's with the current candt motion vectors */
2604                 s_result_prms.i2_mv_x = mvx_qpel;
2605                 s_result_prms.i2_mv_y = mvy_qpel + 2;
2606                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2607                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;
2608 
2609                 pf_err_compute(&s_err_prms, &s_result_prms);
2610                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2611                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2612                 if(i4_tot_cost < i4_min_cost)
2613                 {
2614                     i4_min_cost = i4_tot_cost;
2615                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2616                     e_min_id = PT_B;
2617                     pu1_final_out = s_err_prms.pu1_ref;
2618                 }
2619             }
2620         }
2621         /* Early exit in case of central point */
2622         if(e_min_id == PT_C)
2623             break;
2624 
2625         /*********************************************************************/
2626         /* Depending on the best result location, we may be able to skip     */
2627         /* atleast two pts, centre pt and one more pt. E.g. if right pt is   */
2628         /* the best result, the next iteration need not do centre, left pts  */
2629         /*********************************************************************/
2630         i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2631         i4_mv_x += gai1_grid_id_to_x[e_min_id];
2632         i4_mv_y += gai1_grid_id_to_y[e_min_id];
2633         ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2634         ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2635         i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2636     }
2637 
2638     /* Convert to QPEL units */
2639     i4_mv_x <<= 1;
2640     i4_mv_y <<= 1;
2641 
2642     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2643     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2644 
2645     /* Exact interpolation or averaging chosen here */
2646     pf_qpel_interp = ps_prms->pf_qpel_interp;
2647 
2648     /* Next QPEL ME */
2649     /* In this case, we have option of doing exact QPEL interpolation or avg */
2650     /*************************************************************************/
2651     /*        x                                                              */
2652     /*    A b C d                                                            */
2653     /*    e f g h                                                            */
2654     /*    I j K l                                                            */
2655     /*    m n o p                                                            */
2656     /*    Q r S t                                                            */
2657     /*                                                                       */
2658     /*    Approximate QPEL logic                                             */
2659     /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
2660     /*    for any given pt, we can get all the information required about    */
2661     /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
2662     /*     surrounding pts info:                                             */
2663     /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
2664     /*           buffer 2: hxfy, offsets for both are 0, 0                   */
2665     /*    similarly for other pts the info can be gotten                     */
2666     /*************************************************************************/
2667     i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
2668     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2669 
2670     /*************************************************************************/
2671     /* One time preparation of non changing interpolation params. These      */
2672     /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
2673     /* working memory (not used though in case of averaging).                */
2674     /*************************************************************************/
2675     s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
2676     s_interp_prms.i4_ref_stride = i4_ref_stride;
2677     s_interp_prms.i4_blk_wd = i4_blk_wd;
2678     s_interp_prms.i4_blk_ht = i4_blk_ht;
2679 
2680     i4_final_out_stride = i4_ref_stride;
2681 
2682     {
2683         U08 *pu1_mem;
2684         /*********************************************************************/
2685         /* Allocation of working memory for interpolated buffers. We maintain*/
2686         /* an intermediate working buffer, and 2 ping pong interpolated out  */
2687         /* buffers, purpose of ping pong explained later below               */
2688         /*********************************************************************/
2689         pu1_mem = ps_prms->pu1_wkg_mem;
2690         s_interp_prms.pu1_wkg_mem = pu1_mem;
2691 
2692         //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
2693         s_interp_prms.apu1_interp_out[0] = pu1_mem;
2694 
2695         pu1_mem += (INTERP_OUT_BUF_SIZE);
2696         s_interp_prms.apu1_interp_out[1] = pu1_mem;
2697 
2698         pu1_mem += (INTERP_OUT_BUF_SIZE);
2699         s_interp_prms.apu1_interp_out[2] = pu1_mem;
2700 
2701         pu1_mem += (INTERP_OUT_BUF_SIZE);
2702         s_interp_prms.apu1_interp_out[3] = pu1_mem;
2703 
2704         pu1_mem += (INTERP_OUT_BUF_SIZE);
2705         s_interp_prms.apu1_interp_out[4] = pu1_mem;
2706 
2707         /*********************************************************************/
2708         /* Stride of interpolated output is just a function of blk width of  */
2709         /* this partition and hence remains constant for this partition      */
2710         /*********************************************************************/
2711         s_interp_prms.i4_out_stride = (i4_blk_wd);
2712     }
2713 
2714     {
2715         UWORD8 *apu1_final[4];
2716         WORD32 ai4_ref_stride[4];
2717         /*************************************************************************/
2718         /* Ping pong design for interpolated buffers. We use a min id, which     */
2719         /* tracks the id of the ppu1_interp_out that stores the best result.     */
2720         /* When new interp to be done, it uses 1 - bes result id to do the interp*/
2721         /* min id is toggled when any new result becomes the best result.        */
2722         /*************************************************************************/
2723 
2724         for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
2725         {
2726             e_min_id = PT_C;
2727 
2728             mvx_qpel = i4_mv_x;
2729             mvy_qpel = i4_mv_y;
2730             hme_qpel_interp_comprehensive(
2731                 &s_interp_prms,
2732                 apu1_final,
2733                 ai4_ref_stride,
2734                 i4_mv_x,
2735                 i4_mv_y,
2736                 i4_grid_mask,
2737                 ps_me_optimised_function_list);
2738             if(i4_grid_mask & BIT_EN(PT_L))
2739             {
2740                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2741                     ps_dedup_enabler,
2742                     num_unique_nodes,
2743                     mvx_qpel - 1,
2744                     mvy_qpel - 0,
2745                     check_for_duplicate);
2746 
2747                 if(!check_for_duplicate)
2748                 {
2749                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
2750                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2751 
2752                     s_err_prms.pu1_ref = apu1_final[0];
2753                     s_err_prms.i4_ref_stride = ai4_ref_stride[0];
2754 
2755                     /* Update the mv's with the current candt motion vectors */
2756                     s_result_prms.i2_mv_x = mvx_qpel - 1;
2757                     s_result_prms.i2_mv_y = mvy_qpel;
2758                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
2759                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2760 
2761                     pf_err_compute(&s_err_prms, &s_result_prms);
2762                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2763 
2764                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2765                     if(i4_tot_cost < i4_min_cost)
2766                     {
2767                         e_min_id = PT_L;
2768                         i4_min_cost = i4_tot_cost;
2769                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2770                     }
2771                 }
2772             }
2773             if(i4_grid_mask & BIT_EN(PT_T))
2774             {
2775                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2776                     ps_dedup_enabler,
2777                     num_unique_nodes,
2778                     mvx_qpel - 0,
2779                     mvy_qpel - 1,
2780                     check_for_duplicate);
2781 
2782                 if(!check_for_duplicate)
2783                 {
2784                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2785                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
2786 
2787                     s_err_prms.pu1_ref = apu1_final[1];
2788                     s_err_prms.i4_ref_stride = ai4_ref_stride[1];
2789 
2790                     /* Update the mv's with the current candt motion vectors */
2791                     s_result_prms.i2_mv_x = mvx_qpel;
2792                     s_result_prms.i2_mv_y = mvy_qpel - 1;
2793 
2794                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2795                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;
2796 
2797                     pf_err_compute(&s_err_prms, &s_result_prms);
2798 
2799                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2800                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2801                     if(i4_tot_cost < i4_min_cost)
2802                     {
2803                         e_min_id = PT_T;
2804                         i4_min_cost = i4_tot_cost;
2805                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2806                     }
2807                 }
2808             }
2809             if(i4_grid_mask & BIT_EN(PT_R))
2810             {
2811                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2812                     ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
2813 
2814                 if(!check_for_duplicate)
2815                 {
2816                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
2817                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2818 
2819                     s_err_prms.pu1_ref = apu1_final[2];
2820                     s_err_prms.i4_ref_stride = ai4_ref_stride[2];
2821 
2822                     /* Update the mv's with the current candt motion vectors */
2823                     s_result_prms.i2_mv_x = mvx_qpel + 1;
2824                     s_result_prms.i2_mv_y = mvy_qpel;
2825 
2826                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
2827                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2828 
2829                     pf_err_compute(&s_err_prms, &s_result_prms);
2830 
2831                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2832 
2833                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2834                     if(i4_tot_cost < i4_min_cost)
2835                     {
2836                         e_min_id = PT_R;
2837                         i4_min_cost = i4_tot_cost;
2838                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2839                     }
2840                 }
2841             }
2842             /* i4_mv_x and i4_mv_y will always be the centre pt */
2843             /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
2844             if(i4_grid_mask & BIT_EN(PT_B))
2845             {
2846                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2847                     ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
2848 
2849                 if(!check_for_duplicate)
2850                 {
2851                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2852                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
2853 
2854                     s_err_prms.pu1_ref = apu1_final[3];
2855                     s_err_prms.i4_ref_stride = ai4_ref_stride[3];
2856 
2857                     /* Update the mv's with the current candt motion vectors */
2858                     s_result_prms.i2_mv_x = mvx_qpel;
2859                     s_result_prms.i2_mv_y = mvy_qpel + 1;
2860 
2861                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2862                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;
2863 
2864                     pf_err_compute(&s_err_prms, &s_result_prms);
2865 
2866                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2867                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2868                     if(i4_tot_cost < i4_min_cost)
2869                     {
2870                         e_min_id = PT_B;
2871                         i4_min_cost = i4_tot_cost;
2872                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2873                     }
2874                 }
2875             }
2876 
2877             /* New QPEL mv x and y */
2878             if(e_min_id == PT_C)
2879                 break;
2880             i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2881             i4_mv_x += gai1_grid_id_to_x[e_min_id];
2882             i4_mv_y += gai1_grid_id_to_y[e_min_id];
2883             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2884             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2885             i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2886         }
2887     }
2888 
2889     /* update modified motion vectors and cost at end of subpel */
2890     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2891     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2892     ps_search_node->i4_tot_cost = i4_min_cost;
2893     ps_search_node->i4_sad = i4_min_sad;
2894 
2895     /********************************************************************************/
2896     /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
2897     /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
2898     /********************************************************************************/
2899     //ps_pred_ctxt->lambda >>= 1;
2900 
2901     return (i4_min_cost);
2902 }
2903 #elif DIAMOND_GRID == 0
hme_subpel_refine_search_node_high_speed(search_node_t * ps_search_node,hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,BLK_SIZE_T e_blk_size,S32 x_off,S32 y_off,search_results_t * ps_search_results,S32 pred_lx,S32 i4_part_mask,S32 * pi4_valid_part_ids,S32 search_idx,subpel_dedup_enabler_t * ps_dedup_enabler,me_func_selector_t * ps_func_selector)2904 S32 hme_subpel_refine_search_node_high_speed(
2905     search_node_t *ps_search_node,
2906     hme_subpel_prms_t *ps_prms,
2907     layer_ctxt_t *ps_curr_layer,
2908     BLK_SIZE_T e_blk_size,
2909     S32 x_off,
2910     S32 y_off,
2911     search_results_t *ps_search_results,
2912     S32 pred_lx,
2913     S32 i4_part_mask,
2914     S32 *pi4_valid_part_ids,
2915     S32 search_idx,
2916     subpel_dedup_enabler_t *ps_dedup_enabler,
2917     me_func_selector_t *ps_func_selector)
2918 {
2919     S32 i4_num_hpel_refine, i4_num_qpel_refine;
2920     S32 i4_offset, i4_grid_mask;
2921     S08 i1_ref_idx;
2922     S32 i4_blk_wd, i4_blk_ht;
2923     S32 i4_ref_stride, i4_i;
2924     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2925     result_upd_prms_t s_result_prms;
2926 
2927     /*************************************************************************/
2928     /* Tracks current MV with the fractional component.                      */
2929     /*************************************************************************/
2930     S32 i4_mv_x, i4_mv_y;
2931     S32 i4_frac_x, i4_frac_y;
2932 
2933     /*************************************************************************/
2934     /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2935     /* This function                                                         */
2936     /*************************************************************************/
2937     PF_SAD_FXN_T pf_err_compute;
2938     S32 ai4_sad_grid[9][17], i4_tot_cost;
2939     err_prms_t s_err_prms;
2940 
2941     /*************************************************************************/
2942     /* Allowed MV RANGE                                                      */
2943     /*************************************************************************/
2944     range_prms_t *ps_range_prms;
2945 
2946     /*************************************************************************/
2947     /* stores min id in grid with associated min cost.                       */
2948     /*************************************************************************/
2949     S32 i4_min_cost, i4_min_sad;
2950     GRID_PT_T e_min_id;
2951 
2952     PF_INTERP_FXN_T pf_qpel_interp;
2953     /*************************************************************************/
2954     /* For hpel and qpel we move in diamonds and hence each point in the     */
2955     /* diamond will belong to a completely different plane. To simplify the  */
2956     /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2957     /* hpel planes which are interpolated during recon.                      */
2958     /*************************************************************************/
2959     U08 *apu1_hpel_ref[4], *pu1_ref;
2960 
2961     interp_prms_t s_interp_prms;
2962 
2963     /*************************************************************************/
2964     /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2965     /* points to the corresponding predicted buf with its stride.            */
2966     /* Note that the pointer cannot be derived just from the id, since the   */
2967     /* pointer may also point to the hpel buffer (in case we request interp  */
2968     /* of a hpel pt, which already exists in the recon hpel planes)          */
2969     /*************************************************************************/
2970     U08 *pu1_final_out;
2971     S32 i4_final_out_stride;
2972     S32 part_id;
2973     S32 check_for_duplicate = 0;
2974 
2975     S32 mvx_qpel;
2976     S32 mvy_qpel;
2977 
2978     /*************************************************************************/
2979     /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/
2980     /* fixed through this subpel refinement for this partition.              */
2981     /* Note, we do not enable grid sads since each pt is different buffers.  */
2982     /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled.   */
2983     /*************************************************************************/
2984     if(ps_prms->i4_use_satd)
2985     {
2986         pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
2987     }
2988     else
2989     {
2990         pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
2991     }
2992 
2993     i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2994     i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2995 
2996     /* Prediction context should now deal with qpel units */
2997     HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2998 
2999     /* Buffer allocation for subpel */
3000     /* Current design is that there may be many partitions and different mvs */
3001     /* that attempt subpel refinement. While there is possibility of overlap, the */
3002     /* hashing to detect and avoid overlap may be very complex. So, currently,   */
3003     /* the only thing done is to store the eventual predicted buffer with every  */
3004     /* ctb node that holds the result of the best subpel search */
3005 
3006     /* Compute the base pointer for input, interpolated buffers */
3007     /* The base pointers point as follows: */
3008     /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, hy: 0.5, 0.5 */
3009     /* To these, we need to add the offset of the current node */
3010     i4_ref_stride = ps_curr_layer->i4_rec_stride;
3011     i4_offset = x_off + (y_off * i4_ref_stride);
3012     i1_ref_idx = ps_search_node->i1_ref_idx;
3013 
3014     apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
3015     apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
3016     apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
3017     apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
3018 
3019     /* Initialize result params used for partition update */
3020     s_result_prms.pf_mv_cost_compute = NULL;
3021     s_result_prms.ps_search_results = ps_search_results;
3022     s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
3023     s_result_prms.i1_ref_idx = search_idx;
3024     s_result_prms.i4_part_mask = i4_part_mask;
3025     s_result_prms.ps_search_node_base = ps_search_node;
3026     s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3027     s_result_prms.i4_grid_mask = 1;
3028 
3029     /* convert to hpel units */
3030     i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
3031     i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
3032 
3033     /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
3034     ps_range_prms = ps_prms->ps_mv_range_qpel;
3035     i4_grid_mask = (GRID_ALL_PTS_VALID);
3036     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3037 
3038     i4_min_cost = MAX_32BIT_VAL;
3039     i4_min_sad = MAX_32BIT_VAL;
3040 
3041     /*************************************************************************/
3042     /* Prepare the input params to SAD/SATD function. Note that input is     */
3043     /* passed from the calling function since it may be I (normal subpel     */
3044     /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
3045     /* Both cases are handled here.                                          */
3046     /*************************************************************************/
3047     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
3048     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
3049     s_err_prms.i4_ref_stride = i4_ref_stride;
3050     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
3051     s_err_prms.i4_grid_mask = 1;
3052     s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3053     s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
3054     s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
3055 
3056     /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */
3057     //ps_pred_ctxt->lambda <<= 1;
3058     part_id = ps_search_node->u1_part_id;
3059     for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
3060     {
3061         e_min_id = PT_C;
3062 
3063         mvx_qpel = i4_mv_x << 1;
3064         mvy_qpel = i4_mv_y << 1;
3065 
3066         /* Central pt */
3067         if(i4_grid_mask & BIT_EN(PT_C))
3068         {
3069             //ps_search_node->i2_mv_x = (S16)i4_mv_x;
3070             //ps_search_node->i2_mv_x = (S16)i4_mv_y;
3071             /* central pt is i4_mv_x, i4_mv_y */
3072             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3073                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
3074 
3075             i4_frac_x = i4_mv_x & 1;
3076             i4_frac_y = i4_mv_y & 1;
3077             pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3078             s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3079             pf_err_compute(&s_err_prms);
3080             /* Update the mv's with the current candt motion vectors */
3081             s_result_prms.i2_mv_x = mvx_qpel;
3082             s_result_prms.i2_mv_y = mvy_qpel;
3083             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3084             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3085             if(i4_tot_cost < i4_min_cost)
3086             {
3087                 i4_min_cost = i4_tot_cost;
3088                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3089                 e_min_id = PT_C;
3090                 pu1_final_out = s_err_prms.pu1_ref;
3091             }
3092         }
3093 
3094         /* left pt */
3095         if(i4_grid_mask & BIT_EN(PT_L))
3096         {
3097             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3098                 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
3099 
3100             if(!check_for_duplicate)
3101             {
3102                 /* search node mv is stored in qpel units */
3103                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
3104                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3105                 /* left pt is i4_mv_x - 1, i4_mv_y */
3106                 i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
3107                 i4_frac_y = i4_mv_y & 1;
3108                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3109                 s_err_prms.pu1_ref =
3110                     pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3111 
3112                 pf_err_compute(&s_err_prms);
3113                 /* Update the mv's with the current candt motion vectors */
3114                 s_result_prms.i2_mv_x = mvx_qpel;
3115                 s_result_prms.i2_mv_y = mvy_qpel;
3116                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3117 
3118                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3119 
3120                 if(i4_tot_cost < i4_min_cost)
3121                 {
3122                     i4_min_cost = i4_tot_cost;
3123                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3124                     e_min_id = PT_L;
3125                     pu1_final_out = s_err_prms.pu1_ref;
3126                 }
3127             }
3128         }
3129         /* top pt */
3130         if(i4_grid_mask & BIT_EN(PT_T))
3131         {
3132             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3133                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
3134 
3135             if(!check_for_duplicate)
3136             {
3137                 /* search node mv is stored in qpel units */
3138                 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3139                 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
3140                 /* top pt is i4_mv_x, i4_mv_y - 1 */
3141                 i4_frac_x = i4_mv_x & 1;
3142                 i4_frac_y = (i4_mv_y - 1) & 1;
3143                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3144                 s_err_prms.pu1_ref =
3145                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
3146                 pf_err_compute(&s_err_prms);
3147                 /* Update the mv's with the current candt motion vectors */
3148                 s_result_prms.i2_mv_x = mvx_qpel;
3149                 s_result_prms.i2_mv_y = mvy_qpel - 2;
3150                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3151 
3152                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3153 
3154                 if(i4_tot_cost < i4_min_cost)
3155                 {
3156                     i4_min_cost = i4_tot_cost;
3157                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3158                     e_min_id = PT_T;
3159                     pu1_final_out = s_err_prms.pu1_ref;
3160                 }
3161             }
3162         }
3163         /* right pt */
3164         if(i4_grid_mask & BIT_EN(PT_R))
3165         {
3166             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3167                 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
3168 
3169             if(!check_for_duplicate)
3170             {
3171                 /* search node mv is stored in qpel units */
3172                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
3173                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3174                 /* right pt is i4_mv_x + 1, i4_mv_y */
3175                 i4_frac_x = (i4_mv_x + 1) & 1;
3176                 i4_frac_y = i4_mv_y & 1;
3177 
3178                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3179                 s_err_prms.pu1_ref =
3180                     pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3181                 pf_err_compute(&s_err_prms);
3182                 /* Update the mv's with the current candt motion vectors */
3183                 s_result_prms.i2_mv_x = mvx_qpel + 2;
3184                 s_result_prms.i2_mv_y = mvy_qpel;
3185                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3186 
3187                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3188 
3189                 if(i4_tot_cost < i4_min_cost)
3190                 {
3191                     i4_min_cost = i4_tot_cost;
3192                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3193                     e_min_id = PT_R;
3194                     pu1_final_out = s_err_prms.pu1_ref;
3195                 }
3196             }
3197         }
3198         /* bottom pt */
3199         if(i4_grid_mask & BIT_EN(PT_B))
3200         {
3201             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3202                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
3203 
3204             if(!check_for_duplicate)
3205             {
3206                 /* search node mv is stored in qpel units */
3207                 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
3208                 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
3209                 i4_frac_x = i4_mv_x & 1;
3210                 i4_frac_y = (i4_mv_y + 1) & 1;
3211                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3212                 s_err_prms.pu1_ref =
3213                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
3214 
3215                 pf_err_compute(&s_err_prms);
3216                 /* Update the mv's with the current candt motion vectors */
3217                 s_result_prms.i2_mv_x = mvx_qpel;
3218                 s_result_prms.i2_mv_y = mvy_qpel + 2;
3219                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3220 
3221                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3222 
3223                 if(i4_tot_cost < i4_min_cost)
3224                 {
3225                     i4_min_cost = i4_tot_cost;
3226                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3227                     e_min_id = PT_B;
3228                     pu1_final_out = s_err_prms.pu1_ref;
3229                 }
3230             }
3231         }
3232         if(e_min_id == PT_C)
3233         {
3234             if(!i4_i)
3235             {
3236                 /* TL pt */
3237                 if(i4_grid_mask & BIT_EN(PT_TL))
3238                 {
3239                     S32 mvx_minus_1 = (i4_mv_x - 1);
3240                     S32 mvy_minus_1 = (i4_mv_y - 1);
3241 
3242                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3243                         ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate);
3244 
3245                     if(!check_for_duplicate)
3246                     {
3247                         /* search node mv is stored in qpel units */
3248                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3249                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3250                         i4_frac_x = mvx_minus_1 & 1;
3251                         i4_frac_y = mvy_minus_1 & 1;
3252                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3253                         s_err_prms.pu1_ref =
3254                             pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3255 
3256                         pf_err_compute(&s_err_prms);
3257                         /* Update the mv's with the current candt motion vectors */
3258                         s_result_prms.i2_mv_x = mvx_qpel - 2;
3259                         s_result_prms.i2_mv_y = mvy_qpel - 2;
3260                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3261 
3262                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3263 
3264                         if(i4_tot_cost < i4_min_cost)
3265                         {
3266                             i4_min_cost = i4_tot_cost;
3267                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3268                             e_min_id = PT_TL;
3269                             pu1_final_out = s_err_prms.pu1_ref;
3270                         }
3271                     }
3272                 }
3273                 /* TR pt */
3274                 if(i4_grid_mask & BIT_EN(PT_TR))
3275                 {
3276                     S32 mvx_plus_1 = (i4_mv_x + 1);
3277                     S32 mvy_minus_1 = (i4_mv_y - 1);
3278 
3279                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3280                         ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate);
3281 
3282                     if(!check_for_duplicate)
3283                     {
3284                         /* search node mv is stored in qpel units */
3285                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3286                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3287                         i4_frac_x = mvx_plus_1 & 1;
3288                         i4_frac_y = mvy_minus_1 & 1;
3289                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3290                         s_err_prms.pu1_ref =
3291                             pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3292 
3293                         pf_err_compute(&s_err_prms);
3294                         /* Update the mv's with the current candt motion vectors */
3295                         s_result_prms.i2_mv_x = mvx_qpel + 2;
3296                         s_result_prms.i2_mv_y = mvy_qpel - 2;
3297                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3298 
3299                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3300 
3301                         if(i4_tot_cost < i4_min_cost)
3302                         {
3303                             i4_min_cost = i4_tot_cost;
3304                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3305                             e_min_id = PT_TR;
3306                             pu1_final_out = s_err_prms.pu1_ref;
3307                         }
3308                     }
3309                 }
3310                 /* BL pt */
3311                 if(i4_grid_mask & BIT_EN(PT_BL))
3312                 {
3313                     S32 mvx_minus_1 = (i4_mv_x - 1);
3314                     S32 mvy_plus_1 = (i4_mv_y + 1);
3315 
3316                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3317                         ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate);
3318 
3319                     if(!check_for_duplicate)
3320                     {
3321                         /* search node mv is stored in qpel units */
3322                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3323                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3324                         i4_frac_x = mvx_minus_1 & 1;
3325                         i4_frac_y = mvy_plus_1 & 1;
3326                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3327                         s_err_prms.pu1_ref =
3328                             pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3329 
3330                         pf_err_compute(&s_err_prms);
3331                         /* Update the mv's with the current candt motion vectors */
3332                         s_result_prms.i2_mv_x = mvx_qpel - 2;
3333                         s_result_prms.i2_mv_y = mvy_qpel + 2;
3334                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3335 
3336                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3337 
3338                         if(i4_tot_cost < i4_min_cost)
3339                         {
3340                             i4_min_cost = i4_tot_cost;
3341                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3342                             e_min_id = PT_BL;
3343                             pu1_final_out = s_err_prms.pu1_ref;
3344                         }
3345                     }
3346                 }
3347                 /* BR pt */
3348                 if(i4_grid_mask & BIT_EN(PT_BR))
3349                 {
3350                     S32 mvx_plus_1 = (i4_mv_x + 1);
3351                     S32 mvy_plus_1 = (i4_mv_y + 1);
3352                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3353                         ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate);
3354 
3355                     if(!check_for_duplicate)
3356                     {
3357                         /* search node mv is stored in qpel units */
3358                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3359                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3360                         i4_frac_x = mvx_plus_1 & 1;
3361                         i4_frac_y = mvy_plus_1 & 1;
3362                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3363                         s_err_prms.pu1_ref =
3364                             pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3365 
3366                         pf_err_compute(&s_err_prms);
3367                         /* Update the mv's with the current candt motion vectors */
3368                         s_result_prms.i2_mv_x = mvx_qpel + 2;
3369                         s_result_prms.i2_mv_y = mvy_qpel + 2;
3370                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3371 
3372                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3373 
3374                         if(i4_tot_cost < i4_min_cost)
3375                         {
3376                             i4_min_cost = i4_tot_cost;
3377                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3378                             e_min_id = PT_BR;
3379                             pu1_final_out = s_err_prms.pu1_ref;
3380                         }
3381                     }
3382                 }
3383                 if(e_min_id == PT_C)
3384                 {
3385                     break;
3386                 }
3387             }
3388             else
3389             {
3390                 break;
3391             }
3392         }
3393 
3394         /*********************************************************************/
3395         /* Depending on the best result location, we may be able to skip     */
3396         /* atleast two pts, centre pt and one more pt. E.g. if right pt is   */
3397         /* the best result, the next iteration need not do centre, left pts  */
3398         /*********************************************************************/
3399         if(i4_i)
3400         {
3401             i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3402         }
3403         else
3404         {
3405             i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3406         }
3407         i4_mv_x += gai1_grid_id_to_x[e_min_id];
3408         i4_mv_y += gai1_grid_id_to_y[e_min_id];
3409         ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3410         ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3411         i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3412     }
3413 
3414     /* Convert to QPEL units */
3415     i4_mv_x <<= 1;
3416     i4_mv_y <<= 1;
3417 
3418     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3419     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3420 
3421     /* Early exit if this partition is visiting same hpel mv again */
3422     /* Assumption : Checking for early exit in best result of partition */
3423     if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x ==
3424         ps_search_node->s_mv.i2_mvx) &&
3425        (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y ==
3426         ps_search_node->s_mv.i2_mvy))
3427     {
3428         return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost);
3429     }
3430     else
3431     {
3432         /* Store the best hpel mv for future early exit checks */
3433         ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x =
3434             (S16)i4_mv_x;
3435         ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y =
3436             (S16)i4_mv_y;
3437     }
3438 
3439     /* Early exit if this partition is visiting same hpel mv again */
3440     /* Assumption : Checking for early exit in second best result of partition */
3441     if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x ==
3442         ps_search_node->s_mv.i2_mvx) &&
3443        (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y ==
3444         ps_search_node->s_mv.i2_mvy))
3445     {
3446         return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost);
3447     }
3448     else
3449     {
3450         /* Store the best hpel mv for future early exit checks */
3451         ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x =
3452             (S16)i4_mv_x;
3453         ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y =
3454             (S16)i4_mv_y;
3455     }
3456 
3457     /* Exact interpolation or averaging chosen here */
3458     pf_qpel_interp = ps_prms->pf_qpel_interp;
3459 
3460     /* Next QPEL ME */
3461     /* In this case, we have option of doing exact QPEL interpolation or avg */
3462     /*************************************************************************/
3463     /*        x                                                              */
3464     /*    A b C d                                                            */
3465     /*    e f g h                                                            */
3466     /*    I j K l                                                            */
3467     /*    m n o p                                                            */
3468     /*    Q r S t                                                            */
3469     /*                                                                       */
3470     /*    Approximate QPEL logic                                             */
3471     /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
3472     /*    for any given pt, we can get all the information required about    */
3473     /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
3474     /*     surrounding pts info:                                             */
3475     /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
3476     /*           buffer 2: hxfy, offsets for both are 0, 0                   */
3477     /*    similarly for other pts the info can be gotten                     */
3478     /*************************************************************************/
3479     i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C));
3480     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3481 
3482     /*************************************************************************/
3483     /* One time preparation of non changing interpolation params. These      */
3484     /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
3485     /* working memory (not used though in case of averaging).                */
3486     /*************************************************************************/
3487     s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
3488     s_interp_prms.i4_ref_stride = i4_ref_stride;
3489     s_interp_prms.i4_blk_wd = i4_blk_wd;
3490     s_interp_prms.i4_blk_ht = i4_blk_ht;
3491 
3492     i4_final_out_stride = i4_ref_stride;
3493 
3494     {
3495         U08 *pu1_mem;
3496         /*********************************************************************/
3497         /* Allocation of working memory for interpolated buffers. We maintain*/
3498         /* an intermediate working buffer, and 2 ping pong interpolated out  */
3499         /* buffers, purpose of ping pong explained later below               */
3500         /*********************************************************************/
3501         pu1_mem = ps_prms->pu1_wkg_mem;
3502         s_interp_prms.pu1_wkg_mem = pu1_mem;
3503 
3504         //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
3505         s_interp_prms.apu1_interp_out[0] = pu1_mem;
3506 
3507         pu1_mem += (INTERP_OUT_BUF_SIZE);
3508         s_interp_prms.apu1_interp_out[1] = pu1_mem;
3509 
3510         pu1_mem += (INTERP_OUT_BUF_SIZE);
3511         s_interp_prms.apu1_interp_out[2] = pu1_mem;
3512 
3513         pu1_mem += (INTERP_OUT_BUF_SIZE);
3514         s_interp_prms.apu1_interp_out[3] = pu1_mem;
3515 
3516         pu1_mem += (INTERP_OUT_BUF_SIZE);
3517         s_interp_prms.apu1_interp_out[4] = pu1_mem;
3518 
3519         /*********************************************************************/
3520         /* Stride of interpolated output is just a function of blk width of  */
3521         /* this partition and hence remains constant for this partition      */
3522         /*********************************************************************/
3523         s_interp_prms.i4_out_stride = (i4_blk_wd);
3524     }
3525 
3526     {
3527         UWORD8 *apu1_final[4];
3528         WORD32 ai4_ref_stride[4];
3529         /*************************************************************************/
3530         /* Ping pong design for interpolated buffers. We use a min id, which     */
3531         /* tracks the id of the ppu1_interp_out that stores the best result.     */
3532         /* When new interp to be done, it uses 1 - bes result id to do the interp*/
3533         /* min id is toggled when any new result becomes the best result.        */
3534         /*************************************************************************/
3535 
3536         for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
3537         {
3538             e_min_id = PT_C;
3539 
3540             hme_qpel_interp_comprehensive(
3541                 &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask);
3542 
3543             mvx_qpel = i4_mv_x;
3544             mvy_qpel = i4_mv_y;
3545 
3546             if(i4_grid_mask & BIT_EN(PT_L))
3547             {
3548                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3549                     ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate);
3550 
3551                 if(!check_for_duplicate)
3552                 {
3553                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3554                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3555 
3556                     s_err_prms.pu1_ref = apu1_final[0];
3557                     s_err_prms.i4_ref_stride = ai4_ref_stride[0];
3558 
3559                     pf_err_compute(&s_err_prms);
3560                     /* Update the mv's with the current candt motion vectors */
3561                     s_result_prms.i2_mv_x = mvx_qpel - 1;
3562                     s_result_prms.i2_mv_y = mvy_qpel;
3563                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3564 
3565                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3566                     if(i4_tot_cost < i4_min_cost)
3567                     {
3568                         e_min_id = PT_L;
3569                         i4_min_cost = i4_tot_cost;
3570                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3571                     }
3572                 }
3573             }
3574             if(i4_grid_mask & BIT_EN(PT_T))
3575             {
3576                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3577                     ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate);
3578 
3579                 if(!check_for_duplicate)
3580                 {
3581                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3582                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3583 
3584                     s_err_prms.pu1_ref = apu1_final[1];
3585                     s_err_prms.i4_ref_stride = ai4_ref_stride[1];
3586 
3587                     pf_err_compute(&s_err_prms);
3588                     /* Update the mv's with the current candt motion vectors */
3589                     s_result_prms.i2_mv_x = mvx_qpel;
3590                     s_result_prms.i2_mv_y = mvy_qpel - 1;
3591                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3592                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3593                     if(i4_tot_cost < i4_min_cost)
3594                     {
3595                         e_min_id = PT_T;
3596                         i4_min_cost = i4_tot_cost;
3597                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3598                     }
3599                 }
3600             }
3601             if(i4_grid_mask & BIT_EN(PT_R))
3602             {
3603                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3604                     ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
3605 
3606                 if(!check_for_duplicate)
3607                 {
3608                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3609                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3610 
3611                     s_err_prms.pu1_ref = apu1_final[2];
3612                     s_err_prms.i4_ref_stride = ai4_ref_stride[2];
3613 
3614                     pf_err_compute(&s_err_prms);
3615                     /* Update the mv's with the current candt motion vectors */
3616                     s_result_prms.i2_mv_x = mvx_qpel + 1;
3617                     s_result_prms.i2_mv_y = mvy_qpel;
3618                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3619 
3620                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3621                     if(i4_tot_cost < i4_min_cost)
3622                     {
3623                         e_min_id = PT_R;
3624                         i4_min_cost = i4_tot_cost;
3625                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3626                     }
3627                 }
3628             }
3629             /* i4_mv_x and i4_mv_y will always be the centre pt */
3630             /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3631             if(i4_grid_mask & BIT_EN(PT_B))
3632             {
3633                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3634                     ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
3635 
3636                 if(!check_for_duplicate)
3637                 {
3638                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3639                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3640 
3641                     s_err_prms.pu1_ref = apu1_final[3];
3642                     s_err_prms.i4_ref_stride = ai4_ref_stride[3];
3643 
3644                     pf_err_compute(&s_err_prms);
3645                     /* Update the mv's with the current candt motion vectors */
3646                     s_result_prms.i2_mv_x = mvx_qpel;
3647                     s_result_prms.i2_mv_y = mvy_qpel + 1;
3648                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3649                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3650                     if(i4_tot_cost < i4_min_cost)
3651                     {
3652                         e_min_id = PT_B;
3653                         i4_min_cost = i4_tot_cost;
3654                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3655                     }
3656                 }
3657             }
3658 
3659             if(e_min_id == PT_C)
3660             {
3661                 if(!i4_i)
3662                 {
3663                     S32 i4_interp_buf_id = 0;
3664 
3665                     if(i4_grid_mask & BIT_EN(PT_TL))
3666                     {
3667                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3668                             ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate);
3669 
3670                         if(!check_for_duplicate)
3671                         {
3672                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3673                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3674 
3675                             /* Carry out the interpolation */
3676                             pf_qpel_interp(
3677                                 &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id);
3678 
3679                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3680                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3681 
3682                             pf_err_compute(&s_err_prms);
3683                             /* Update the mv's with the current candt motion vectors */
3684                             s_result_prms.i2_mv_x = mvx_qpel - 1;
3685                             s_result_prms.i2_mv_y = mvy_qpel - 1;
3686                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3687 
3688                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3689 
3690                             if(i4_tot_cost < i4_min_cost)
3691                             {
3692                                 e_min_id = PT_TL;
3693                                 i4_min_cost = i4_tot_cost;
3694                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3695                             }
3696                         }
3697                     }
3698                     if(i4_grid_mask & BIT_EN(PT_TR))
3699                     {
3700                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3701                             ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate);
3702 
3703                         if(!check_for_duplicate)
3704                         {
3705                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3706                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3707 
3708                             /* Carry out the interpolation */
3709                             pf_qpel_interp(
3710                                 &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id);
3711 
3712                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3713                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3714 
3715                             pf_err_compute(&s_err_prms);
3716                             /* Update the mv's with the current candt motion vectors */
3717                             s_result_prms.i2_mv_x = mvx_qpel + 1;
3718                             s_result_prms.i2_mv_y = mvy_qpel - 1;
3719                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3720 
3721                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3722 
3723                             if(i4_tot_cost < i4_min_cost)
3724                             {
3725                                 e_min_id = PT_TR;
3726                                 i4_min_cost = i4_tot_cost;
3727                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3728                             }
3729                         }
3730                     }
3731                     if(i4_grid_mask & BIT_EN(PT_BL))
3732                     {
3733                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3734                             ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate);
3735 
3736                         if(!check_for_duplicate)
3737                         {
3738                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3739                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3740 
3741                             /* Carry out the interpolation */
3742                             pf_qpel_interp(
3743                                 &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id);
3744 
3745                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3746                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3747 
3748                             pf_err_compute(&s_err_prms);
3749                             /* Update the mv's with the current candt motion vectors */
3750                             s_result_prms.i2_mv_x = mvx_qpel - 1;
3751                             s_result_prms.i2_mv_y = mvy_qpel + 1;
3752                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3753 
3754                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3755 
3756                             if(i4_tot_cost < i4_min_cost)
3757                             {
3758                                 e_min_id = PT_BL;
3759                                 i4_min_cost = i4_tot_cost;
3760                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3761                             }
3762                         }
3763                     }
3764                     /* i4_mv_x and i4_mv_y will always be the centre pt */
3765                     /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3766                     if(i4_grid_mask & BIT_EN(PT_BR))
3767                     {
3768                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3769                             ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate);
3770 
3771                         if(!check_for_duplicate)
3772                         {
3773                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3774                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3775 
3776                             /* Carry out the interpolation */
3777                             pf_qpel_interp(
3778                                 &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id);
3779 
3780                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3781                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3782 
3783                             pf_err_compute(&s_err_prms);
3784                             /* Update the mv's with the current candt motion vectors */
3785                             s_result_prms.i2_mv_x = mvx_qpel + 1;
3786                             s_result_prms.i2_mv_y = mvy_qpel + 1;
3787                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3788 
3789                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3790 
3791                             if(i4_tot_cost < i4_min_cost)
3792                             {
3793                                 e_min_id = PT_BR;
3794                                 i4_min_cost = i4_tot_cost;
3795                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3796                             }
3797                         }
3798                     }
3799                     if(e_min_id == PT_C)
3800                     {
3801                         break;
3802                     }
3803                 }
3804                 else
3805                 {
3806                     break;
3807                 }
3808             }
3809 
3810             if(i4_i)
3811             {
3812                 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3813             }
3814             else
3815             {
3816                 i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3817             }
3818             i4_mv_x += gai1_grid_id_to_x[e_min_id];
3819             i4_mv_y += gai1_grid_id_to_y[e_min_id];
3820             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3821             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3822             i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3823         }
3824     }
3825 
3826     /* update modified motion vectors and cost at end of subpel */
3827     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3828     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3829     ps_search_node->i4_tot_cost = i4_min_cost;
3830     ps_search_node->i4_sad = i4_min_sad;
3831 
3832     /********************************************************************************/
3833     /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
3834     /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
3835     /********************************************************************************/
3836     //ps_pred_ctxt->lambda >>= 1;
3837 
3838     return (i4_min_cost);
3839 }
3840 #endif
3841 
hme_subpel_refine_struct_to_search_results_struct_converter(subpel_refine_ctxt_t * ps_subpel_refine_ctxt,search_results_t * ps_search_results,U08 u1_pred_dir,ME_QUALITY_PRESETS_T e_quality_preset)3842 static void hme_subpel_refine_struct_to_search_results_struct_converter(
3843     subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
3844     search_results_t *ps_search_results,
3845     U08 u1_pred_dir,
3846     ME_QUALITY_PRESETS_T e_quality_preset)
3847 {
3848     U08 i;
3849 
3850     U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;
3851 
3852     for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
3853     {
3854         S32 index;
3855         S32 i4_sad;
3856 
3857         S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
3858 
3859         search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];
3860 
3861         if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
3862         {
3863             index = part_id;
3864         }
3865         else
3866         {
3867             index = i;
3868         }
3869 
3870         if(!ps_best_node->u1_subpel_done)
3871         {
3872             i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3873                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3874             ps_best_node[0].i4_sdi = 0;
3875             ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
3876             ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3877 
3878             if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3879             {
3880                 i4_sad = MAX_SIGNED_16BIT_VAL;
3881             }
3882 
3883             ps_best_node[0].i4_sad = i4_sad;
3884             ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3885             ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3886             ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3887             ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3888             ps_best_node->u1_subpel_done = 1;
3889 
3890             if(2 == u1_num_results_per_part)
3891             {
3892                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3893                          ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3894                 ps_best_node[1].i4_sdi = 0;
3895                 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3896 
3897                 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3898                 {
3899                     i4_sad = MAX_SIGNED_16BIT_VAL;
3900                 }
3901 
3902                 ps_best_node[1].i4_sad = i4_sad;
3903                 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3904                 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3905                 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3906                 ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3907                 ps_best_node[1].u1_subpel_done = 1;
3908             }
3909         }
3910         else if(
3911             (2 == u1_num_results_per_part) &&
3912             (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
3913         {
3914             if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
3915             {
3916                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3917                          ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3918                 ps_best_node[0].i4_sdi = 0;
3919                 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3920 
3921                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3922                 {
3923                     i4_sad = MAX_SIGNED_16BIT_VAL;
3924                 }
3925 
3926                 ps_best_node[0].i4_sad = i4_sad;
3927                 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3928                 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3929                 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3930                 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3931 
3932                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3933                          ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3934                 ps_best_node[1].i4_sdi = 0;
3935                 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3936 
3937                 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3938                 {
3939                     i4_sad = MAX_SIGNED_16BIT_VAL;
3940                 }
3941 
3942                 ps_best_node[1].i4_sad = i4_sad;
3943                 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3944                 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3945                 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3946                 ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3947             }
3948             else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
3949             {
3950                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
3951                 {
3952                     i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3953                              ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3954                     ps_best_node[1].i4_sdi = 0;
3955                     ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3956 
3957                     if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3958                     {
3959                         i4_sad = MAX_SIGNED_16BIT_VAL;
3960                     }
3961 
3962                     ps_best_node[1].i4_sad = i4_sad;
3963                     ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3964                     ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3965                     ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3966                     ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3967                 }
3968                 else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
3969                 {
3970                     memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));
3971 
3972                     i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3973                              ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3974                     ps_best_node[0].i4_sdi = 0;
3975                     ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3976 
3977                     if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3978                     {
3979                         i4_sad = MAX_SIGNED_16BIT_VAL;
3980                     }
3981 
3982                     ps_best_node[0].i4_sad = i4_sad;
3983                     ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3984                     ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3985                     ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3986                     ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3987                 }
3988             }
3989         }
3990         else if(
3991             (1 == u1_num_results_per_part) &&
3992             (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
3993         {
3994             i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3995                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3996             ps_best_node[0].i4_sdi = 0;
3997             ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3998 
3999             if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
4000             {
4001                 i4_sad = MAX_SIGNED_16BIT_VAL;
4002             }
4003 
4004             ps_best_node[0].i4_sad = i4_sad;
4005             ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4006             ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4007             ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4008             ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
4009         }
4010     }
4011 }
4012 
4013 /**
4014 ********************************************************************************
4015 *  @fn     S32 hme_subpel_refine_cu_hs
4016 *
4017 *  @brief  Evaluates the best subpel mvs for active partitions of an MB in L0
4018 *          layer for the high speed preset. Recursive hadamard SATD / SAD
4019 *          and mv cost is used for 2NxN and NxN partitions with active partition
4020 *          update
4021 *
4022 *  @param[in]  ps_prms: subpel prms input to this function
4023 *
4024 *  @param[in]  ps_curr_layer: points to the current layer ctxt
4025 *
*  @param[out] ps_search_results: points to the search results that get updated
4027 *              with best results
4028 *
4029 *  @param[in]  search_idx:  ref id of the frame for which results get updated
4030 *
4031 *  @param[in]  ps_wt_inp_prms:  current frame input params
4032 *
4033 *  @return     None
4034 ********************************************************************************
4035 */
hme_subpel_refine_cu_hs(hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,search_results_t * ps_search_results,S32 search_idx,wgt_pred_ctxt_t * ps_wt_inp_prms,WORD32 blk_8x8_mask,me_func_selector_t * ps_func_selector,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)4036 void hme_subpel_refine_cu_hs(
4037     hme_subpel_prms_t *ps_prms,
4038     layer_ctxt_t *ps_curr_layer,
4039     search_results_t *ps_search_results,
4040     S32 search_idx,
4041     wgt_pred_ctxt_t *ps_wt_inp_prms,
4042     WORD32 blk_8x8_mask,
4043     me_func_selector_t *ps_func_selector,
4044     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
4045     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
4046 {
4047     /* Unique search node list for 2nx2n and nxn partitions */
4048     search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
4049     subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
4050     search_node_t *ps_search_node;
4051 
4052     S32 i, i4_part_mask, j;
4053     S32 i4_sad_grid;
4054     S32 max_subpel_cand;
4055     WORD32 index;
4056     S32 num_unique_nodes_2nx2n;
4057     S32 part_id;
4058     S32 x_off, y_off;
4059     S32 i4_inp_off;
4060 
4061     CU_SIZE_T e_cu_size;
4062     BLK_SIZE_T e_blk_size;
4063 
4064     subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
4065 
4066     S32 i4_use_satd = ps_prms->i4_use_satd;
4067     S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;
4068 
4069     ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);
4070 
4071     if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
4072     {
4073         e_cu_size = ps_search_results->e_cu_size;
4074         i4_part_mask = ps_search_results->i4_part_mask;
4075 
4076         ps_prms->i4_inp_type = sizeof(U08);
4077 
4078         num_unique_nodes_2nx2n = 0;
4079 
4080         for(i = 0; i < i4_num_act_refs; i++)
4081         {
4082             as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
4083         }
4084 
4085         /************************************************************************/
4086         /*                                                                      */
4087         /*  Initialize SATD cost for each valid partition id.one time before    */
4088         /*  doing full pel time. This is because of the following reasons:      */
4089         /*   1. Full pel cost was done in  SAD while subpel is in SATD mode     */
4090         /*   2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */
4091         /*      doing Diamond search for 2Nx2N and NxN. This partitions are     */
4092         /*      not explicitly refine in high speed mode                        */
4093         /*                                                                      */
4094         /************************************************************************/
4095         for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4096         {
4097             S32 enable_subpel = 0;
4098             S32 part_type;
4099 
4100             /* Derive the x and y offsets of this part id */
4101             part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4102             if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4103             {
4104                 index = part_id;
4105             }
4106             else
4107             {
4108                 index = i;
4109             }
4110 
4111             part_type = ge_part_id_to_part_type[part_id];
4112             x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
4113             y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
4114             x_off += ps_search_results->u1_x_off;
4115             y_off += ps_search_results->u1_y_off;
4116             i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4117             e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];
4118 
4119             x_off += ps_prms->i4_ctb_x_off;
4120             y_off += ps_prms->i4_ctb_y_off;
4121 
4122             max_subpel_cand = 0;
4123 
4124             /* Choose the minimum number of candidates to be used for Sub pel refinement */
4125             if(PART_ID_2Nx2N == part_type)
4126             {
4127                 max_subpel_cand =
4128                     MIN(ps_prms->u1_max_subpel_candts_2Nx2N,
4129                         ps_search_results->u1_num_results_per_part);
4130             }
4131             else if(PRT_NxN == part_type)
4132             {
4133                 max_subpel_cand = MIN(
4134                     ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part);
4135             }
4136 
4137             /* If incomplete CTB, NxN num candidates should be forced to min 1 */
4138             if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
4139             {
4140                 max_subpel_cand = 1;
4141             }
4142 
4143             if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
4144             {
4145                 enable_subpel = 1;
4146             }
4147 
4148             /* Compute full pel SATD for each result per partition before subpel */
4149             /* refinement starts.                                                */
4150             /* Also prepare unique candidate list for 2Nx2N and NxN partitions   */
4151             for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4152             {
4153                 err_prms_t s_err_prms;
4154                 S32 i4_satd = 0;
4155                 S32 i1_ref_idx;
4156                 U08 *pu1_ref_base;
4157                 S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
4158                 S32 i4_mv_x, i4_mv_y;
4159 
4160                 ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;
4161 
4162                 if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
4163                 {
4164                     ps_search_node->u1_subpel_done = 1;
4165                     continue;
4166                 }
4167 
4168                 i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4169                 ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
4170                 pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];
4171 
4172                 i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
4173                 i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];
4174 
4175                 if(i4_use_satd)
4176                 {
4177                     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
4178                     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
4179                     s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x +
4180                                          (i4_mv_y * i4_ref_stride);
4181 
4182                     s_err_prms.i4_ref_stride = i4_ref_stride;
4183                     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
4184                     s_err_prms.i4_grid_mask = 1;
4185                     s_err_prms.pi4_sad_grid = &i4_sad_grid;
4186                     s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
4187                     s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
4188 
4189                     s_err_prms.ps_cmn_utils_optimised_function_list =
4190                         ps_cmn_utils_optimised_function_list;
4191 
4192                     compute_satd_8bit(&s_err_prms);
4193 
4194                     i4_satd = s_err_prms.pi4_sad_grid[0];
4195 
4196                     ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
4197                         CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
4198                     ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
4199                 }
4200 
4201                 /* Sub-pel candidate filtration */
4202                 if(j)
4203                 {
4204                     S16 i2_best_sad;
4205                     S32 i4_best_mvx;
4206                     S32 i4_best_mvy;
4207 
4208                     search_node_t *ps_node =
4209                         ps_search_results->aps_part_results[search_idx][part_id];
4210 
4211                     U08 u1_is_subpel_done = ps_node->u1_subpel_done;
4212                     S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
4213                     S32 i4_curr_mvx = i4_mv_x << 2;
4214                     S32 i4_curr_mvy = i4_mv_y << 2;
4215 
4216                     if(u1_is_subpel_done)
4217                     {
4218                         i2_best_sad = ps_node->i4_sad;
4219 
4220                         if(ps_node->i1_ref_idx == i1_ref_idx)
4221                         {
4222                             i4_best_mvx = ps_node->s_mv.i2_mvx;
4223                             i4_best_mvy = ps_node->s_mv.i2_mvy;
4224                         }
4225                         else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4226                         {
4227                             i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4228                             i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4229                         }
4230                         else
4231                         {
4232                             i4_best_mvx = INTRA_MV;
4233                             i4_best_mvy = INTRA_MV;
4234                         }
4235                     }
4236                     else
4237                     {
4238                         i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
4239                                       ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4240 
4241                         if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4242                         {
4243                             i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4244                             i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4245                         }
4246                         else
4247                         {
4248                             i4_best_mvx = INTRA_MV;
4249                             i4_best_mvy = INTRA_MV;
4250                         }
4251                     }
4252 
4253                     i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);
4254 
4255                     if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
4256                         (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
4257                        (i2_curr_sad > i2_best_sad))
4258                     {
4259                         enable_subpel = 0;
4260                     }
4261                 }
4262 
4263                 ps_search_node->u1_part_id = part_id;
4264 
4265                 /* Convert mvs in part results from FPEL to QPEL units */
4266                 ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
4267                 ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;
4268 
4269                 /* If the candidate number is more than the number of candts
4270                 set initially, do not add those candts for refinement */
4271                 if(j >= max_subpel_cand)
4272                 {
4273                     enable_subpel = 0;
4274                 }
4275 
4276                 if(enable_subpel)
4277                 {
4278                     if(num_unique_nodes_2nx2n == 0)
4279                     {
4280                         S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4281 
4282                         as_subpel_dedup_enabler[i4_index].i2_mv_x =
4283                             ps_subpel_refine_ctxt->i2_mv_x[j][index];
4284                         as_subpel_dedup_enabler[i4_index].i2_mv_y =
4285                             ps_subpel_refine_ctxt->i2_mv_y[j][index];
4286                         as_subpel_dedup_enabler[i4_index].u1_ref_idx =
4287                             (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4288                         memset(
4289                             as_subpel_dedup_enabler[i4_index].au4_node_map,
4290                             0,
4291                             sizeof(U32) * 2 * MAP_X_MAX);
4292                     }
4293                     INSERT_NEW_NODE_NOMAP_ALTERNATE(
4294                         as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
4295                 }
4296             }
4297 
4298             /*********************************************************************************************/
4299             /* If sad_1 < sad_2, then satd_1 need not be less than satd_2. Therefore, after conversion   */
4300             /* to satd, tot_cost_1 may not be less than tot_cost_2. So we need to sort the search nodes  */
4301             /* for each partition again, based on the new costs                                          */
4302             /*********************************************************************************************/
4303             /*********************************************************************************************/
4304             /* Because right now, we store only the two best candidates for each partition, the sort will*/
4305             /* converge to a simple swap.                                                                */
4306             /* ASSUMPTION : We store only two best results per partition                                 */
4307             /*********************************************************************************************/
4308             if(ps_search_results->u1_num_results_per_part == 2)
4309             {
4310                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
4311                    ps_subpel_refine_ctxt->i2_tot_cost[1][index])
4312                 {
4313                     SWAP(
4314                         ps_subpel_refine_ctxt->i2_tot_cost[0][index],
4315                         ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
4316 
4317                     SWAP(
4318                         ps_subpel_refine_ctxt->i2_mv_cost[0][index],
4319                         ps_subpel_refine_ctxt->i2_mv_cost[1][index]);
4320 
4321                     SWAP(
4322                         ps_subpel_refine_ctxt->i2_mv_x[0][index],
4323                         ps_subpel_refine_ctxt->i2_mv_x[1][index]);
4324 
4325                     SWAP(
4326                         ps_subpel_refine_ctxt->i2_mv_y[0][index],
4327                         ps_subpel_refine_ctxt->i2_mv_y[1][index]);
4328 
4329                     SWAP(
4330                         ps_subpel_refine_ctxt->i2_ref_idx[0][index],
4331                         ps_subpel_refine_ctxt->i2_ref_idx[1][index]);
4332 
4333                     SWAP(
4334                         ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
4335                         ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
4336                 }
4337             }
4338         }
4339 
4340         if(blk_8x8_mask == 0xf)
4341         {
4342             num_unique_nodes_2nx2n =
4343                 MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
4344         }
4345         {
4346             x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
4347             y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
4348             x_off += ps_search_results->u1_x_off;
4349             y_off += ps_search_results->u1_y_off;
4350             i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4351             e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];
4352 
4353             for(j = 0; j < num_unique_nodes_2nx2n; j++)
4354             {
4355                 S32 pred_lx;
4356                 ps_search_node = &as_nodes_2nx2n[j];
4357 
4358                 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
4359                 {
4360                     continue;
4361                 }
4362 
4363                 {
4364                     S08 i1_ref_idx = ps_search_node->i1_ref_idx;
4365                     subpel_dedup_enabler_t *ps_dedup_enabler =
4366                         &(as_subpel_dedup_enabler[i1_ref_idx]);
4367 
4368                     if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
4369                     {
4370                         as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
4371                         as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
4372                         as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
4373                         memset(
4374                             as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
4375                             0,
4376                             sizeof(U32) * 2 * MAP_X_MAX);
4377                     }
4378                 }
4379 
4380                 pred_lx = search_idx;
4381                 ps_prms->pv_inp =
4382                     (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);
4383 
4384                 hme_subpel_refine_search_node_high_speed(
4385                     ps_search_node,
4386                     ps_prms,
4387                     ps_curr_layer,
4388                     e_blk_size,
4389                     x_off + ps_prms->i4_ctb_x_off,
4390                     y_off + ps_prms->i4_ctb_y_off,
4391                     ps_search_results,
4392                     pred_lx,
4393                     i4_part_mask,
4394                     &ps_subpel_refine_ctxt->ai4_part_id[0],
4395                     search_idx,
4396                     &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
4397                     ps_func_selector,
4398                     ps_me_optimised_function_list);
4399             }
4400         }
4401     }
4402     else
4403     {
4404         for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4405         {
4406             S32 i4_index;
4407 
4408             S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4409 
4410             if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4411             {
4412                 i4_index = i4_part_id;
4413             }
4414             else
4415             {
4416                 i4_index = i;
4417             }
4418 
4419             for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4420             {
4421                 ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
4422                 ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
4423             }
4424         }
4425     }
4426 
4427     hme_subpel_refine_struct_to_search_results_struct_converter(
4428         ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
4429 }
4430