1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ***************************************************************************
23 * \file hme_err_compute.c
24 *
25 * \brief
26 *    SAD / SATD routines for error computation
27 *
28 * Detailed_description : Contains various types of SAD/SATD routines for
29 *   error computation between a given input and reference ptr. The SAD
30 *   routines can evaluate for either a single point or a grid, and can
31 *   evaluate with either partial updates or no partial updates. Partial
32 *   updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
33 *   addition to the main 8x8 block SAD.
34 *
35 * \date
36 *    22/9/2012
37 *
38 * \author  Ittiam
39 ***************************************************************************
40 */
41 
42 /*****************************************************************************/
43 /* File Includes                                                             */
44 /*****************************************************************************/
45 /* System include files */
46 #include <stdio.h>
47 #include <string.h>
48 #include <stdlib.h>
49 #include <assert.h>
50 #include <stdarg.h>
51 #include <math.h>
52 #include <limits.h>
53 
54 /* User include files */
55 #include "ihevc_typedefs.h"
56 #include "itt_video_api.h"
57 #include "ihevce_api.h"
58 
59 #include "rc_cntrl_param.h"
60 #include "rc_frame_info_collector.h"
61 #include "rc_look_ahead_params.h"
62 
63 #include "ihevc_defs.h"
64 #include "ihevc_structs.h"
65 #include "ihevc_platform_macros.h"
66 #include "ihevc_deblk.h"
67 #include "ihevc_itrans_recon.h"
68 #include "ihevc_chroma_itrans_recon.h"
69 #include "ihevc_chroma_intra_pred.h"
70 #include "ihevc_intra_pred.h"
71 #include "ihevc_inter_pred.h"
72 #include "ihevc_mem_fns.h"
73 #include "ihevc_padding.h"
74 #include "ihevc_weighted_pred.h"
75 #include "ihevc_sao.h"
76 #include "ihevc_resi_trans.h"
77 #include "ihevc_quant_iquant_ssd.h"
78 #include "ihevc_cabac_tables.h"
79 
80 #include "ihevce_defs.h"
81 #include "ihevce_lap_enc_structs.h"
82 #include "ihevce_multi_thrd_structs.h"
83 #include "ihevce_multi_thrd_funcs.h"
84 #include "ihevce_me_common_defs.h"
85 #include "ihevce_had_satd.h"
86 #include "ihevce_error_codes.h"
87 #include "ihevce_bitstream.h"
88 #include "ihevce_cabac.h"
89 #include "ihevce_rdoq_macros.h"
90 #include "ihevce_function_selector.h"
91 #include "ihevce_enc_structs.h"
92 #include "ihevce_entropy_structs.h"
93 #include "ihevce_cmn_utils_instr_set_router.h"
94 #include "ihevce_enc_loop_structs.h"
95 #include "ihevce_bs_compute_ctb.h"
96 #include "ihevce_global_tables.h"
97 #include "ihevce_dep_mngr_interface.h"
98 #include "hme_datatype.h"
99 #include "hme_interface.h"
100 #include "hme_common_defs.h"
101 #include "hme_defs.h"
102 #include "ihevce_me_instr_set_router.h"
103 #include "hme_globals.h"
104 #include "hme_utils.h"
105 #include "hme_coarse.h"
106 #include "hme_refine.h"
107 #include "hme_err_compute.h"
108 #include "hme_common_utils.h"
109 #include "hme_search_algo.h"
110 #include "ihevce_stasino_helpers.h"
111 
112 /******************************************************************************
113 *                         MACRO DEFINITIONS
114 ******************************************************************************/
115 
/*
 * In theory, several kinds of SAD functions are needed for optimal
 * performance:
 *
 *  - A SAD evaluated at a single point can be more optimal than SADs
 *    evaluated over a 3x3 grid. Grid SADs are kept as separate functions,
 *    since evaluating them in a single call helps reuse inputs across the
 *    small 3x3 grid.
 *
 *  - When no partial updates are required there are three basic functions:
 *    width 4K (K odd), width 8K (K odd) and width 16K (any K).
 *
 *  - For partial updates the block is assumed to be square (8x8, 16x16,
 *    32x32, 64x64) and further differentiation is based on the basic
 *    evaluation unit. E.g. if a 16x16 block requires partial updates on AMP
 *    partitions, the basic SAD unit is 4x4; if it does not, the basic SAD
 *    unit is 8x8.
 *
 * All eight result-update variants (point/grid, with/without partial
 * updates, best-1/best-N) currently resolve to the same routine.
 */
#define UPD_RES_PT_NPU_BEST1   hme_update_results_grid_pu_bestn
#define UPD_RES_PT_NPU_BESTN   hme_update_results_grid_pu_bestn
#define UPD_RES_PT_PU_BEST1    hme_update_results_grid_pu_bestn
#define UPD_RES_PT_PU_BESTN    hme_update_results_grid_pu_bestn
#define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn
#define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
#define UPD_RES_GRID_PU_BEST1  hme_update_results_grid_pu_bestn
#define UPD_RES_GRID_PU_BESTN  hme_update_results_grid_pu_bestn
139 
140 /*******************************************************************************
141 *                         FUNCTION DEFINITIONS
142 *******************************************************************************/
hme_cmp_nodes(search_node_t * ps_best_node1,search_node_t * ps_best_node2)143 S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
144 {
145     if((ps_best_node1->s_mv.i2_mvx == ps_best_node2->s_mv.i2_mvx) &&
146        (ps_best_node1->s_mv.i2_mvy == ps_best_node2->s_mv.i2_mvy) &&
147        (ps_best_node1->i1_ref_idx == ps_best_node2->i1_ref_idx))
148     {
149         return 0;
150     }
151     return -1;
152 }
153 
/**
********************************************************************************
*  @fn     compute_4x4_sads_for_16x16_blk(grid_ctxt_t *ps_grid,
*                                        UWORD8      *pu1_cur_ptr,
*                                        WORD32      cur_buf_stride,
*                                        UWORD16     **u2_part_sads,
*                                        cand_t      *ps_cand,
*                                        WORD32      *num_cands)
*
*  @brief  Builds the list of enabled candidates from every grid in the grid
*          ctxt, computes the sixteen 4x4 SADs of a 16x16 block against each
*          candidate and combines them into the SADs of all 17 partitions.
*
*  @param[in]  ps_grid : Grid ctxt (ref pointers, masks, mvs, ref indices,
*                        packed grid step sizes and ref stride)
*
*  @param[in]  pu1_cur_ptr : Pointer to top-left of current block
*
*  @param[in]  cur_buf_stride : Buffer stride of current buffer
*
*  @param[out] u2_part_sads : 2D array of SADs, one row per partition id;
*                             entries within a row correspond to candidates
*
*  @param[out] ps_cand : List of candidates that were evaluated
*
*  @param[out] num_cands : Number of candidates that were processed
*
*  @return   None
********************************************************************************
*/
void compute_4x4_sads_for_16x16_blk(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    UWORD16 **u2_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands)
{
    /* Grid step: y in the upper 16 bits of grd_sz_y_x, x in the lower 16 */
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);

    /* Displacement of each grid point from the centre.                   */
    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR         */
    WORD32 ai4_dx[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
    WORD32 ai4_dy[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };

    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
    /* Distance between vertically adjacent 4x4 units (4 rows) */
    WORD32 cur_unit_row_stride = (cur_buf_stride << 2);
    WORD32 ref_unit_row_stride = (ref_buf_stride << 2);

    cand_t *ps_next_cand = ps_cand;
    UWORD16 au2_sad[NUM_4X4];
    WORD32 grid_idx, cand_idx;

    *num_cands = 0;

    /* Pass 1: collect every grid point whose mask bit is set */
    for(grid_idx = 0; grid_idx < ps_grid->num_grids; grid_idx++)
    {
        WORD32 pt;
        WORD32 mask = ps_grid->pi4_grd_mask[grid_idx];
        UWORD8 *pu1_center = ps_grid->ppu1_ref_ptr[grid_idx];
        WORD32 mv_x = ps_grid->p_mv[grid_idx].i2_mv_x;
        WORD32 mv_y = ps_grid->p_mv[grid_idx].i2_mv_y;

        for(pt = 0; pt < NUM_CANDIDATES_IN_GRID; pt++, mask >>= 1)
        {
            if(!(mask & 1))
                continue;

            *num_cands += 1;
            ps_next_cand->grid_ix = grid_idx;
            ps_next_cand->ref_idx = ps_grid->p_ref_idx[grid_idx];
            ps_next_cand->pu1_ref_ptr = pu1_center + ai4_dx[pt] + ref_buf_stride * ai4_dy[pt];
            ps_next_cand->mv.i2_mv_x = (S16)(mv_x) + ai4_dx[pt];
            ps_next_cand->mv.i2_mv_y = (S16)(mv_y) + ai4_dy[pt];
            ps_next_cand++;
        }
    }

    /* Pass 2: per candidate, sixteen 4x4 SADs followed by partition sums */
    for(cand_idx = 0; cand_idx < *num_cands; cand_idx++)
    {
        cand_t *ps_cur_cand = ps_cand + cand_idx;
        const UWORD16 *s = au2_sad;
        WORD32 blk;

        for(blk = 0; blk < NUM_4X4; blk++)
        {
            /* 4x4 units are numbered in raster order inside the 16x16 blk */
            WORD32 col_off = (blk % 4) * NUM_PIXELS_IN_ROW;
            WORD32 cur_off = col_off + (blk >> 2) * cur_unit_row_stride;
            WORD32 ref_off = col_off + (blk >> 2) * ref_unit_row_stride;
            WORD32 sad = 0;
            WORD32 row, col;

            for(row = 0; row < NUM_ROWS_IN_4X4; row++)
            {
                WORD32 cur_row_off = cur_buf_stride * row + cur_off;
                WORD32 ref_row_off = ref_buf_stride * row + ref_off;

                for(col = 0; col < NUM_PIXELS_IN_ROW; col++)
                {
                    S32 diff = ((S32)ps_cur_cand->pu1_ref_ptr[ref_row_off + col]) -
                               ((S32)pu1_cur_ptr[cur_row_off + col]);

                    sad += ABS(diff);
                }
            }
            au2_sad[blk] = (UWORD16)sad;
        }

        /* Quadrants */
        u2_part_sads[PART_ID_NxN_TL][cand_idx] = (s[0] + s[1] + s[4] + s[5]);
        u2_part_sads[PART_ID_NxN_TR][cand_idx] = (s[2] + s[3] + s[6] + s[7]);
        u2_part_sads[PART_ID_NxN_BL][cand_idx] = (s[8] + s[9] + s[12] + s[13]);
        u2_part_sads[PART_ID_NxN_BR][cand_idx] = (s[10] + s[11] + s[14] + s[15]);

        /* Symmetric halves, built from the quadrants */
        u2_part_sads[PART_ID_Nx2N_L][cand_idx] =
            u2_part_sads[PART_ID_NxN_TL][cand_idx] + u2_part_sads[PART_ID_NxN_BL][cand_idx];
        u2_part_sads[PART_ID_Nx2N_R][cand_idx] =
            u2_part_sads[PART_ID_NxN_TR][cand_idx] + u2_part_sads[PART_ID_NxN_BR][cand_idx];
        u2_part_sads[PART_ID_2NxN_T][cand_idx] =
            u2_part_sads[PART_ID_NxN_TL][cand_idx] + u2_part_sads[PART_ID_NxN_TR][cand_idx];
        u2_part_sads[PART_ID_2NxN_B][cand_idx] =
            u2_part_sads[PART_ID_NxN_BL][cand_idx] + u2_part_sads[PART_ID_NxN_BR][cand_idx];

        /* Narrow AMP strips: one column / one row of 4x4 units */
        u2_part_sads[PART_ID_nLx2N_L][cand_idx] = (s[0] + s[4] + s[8] + s[12]);
        u2_part_sads[PART_ID_nRx2N_R][cand_idx] = (s[3] + s[7] + s[11] + s[15]);
        u2_part_sads[PART_ID_2NxnU_T][cand_idx] = (s[0] + s[1] + s[2] + s[3]);
        u2_part_sads[PART_ID_2NxnD_B][cand_idx] = (s[12] + s[13] + s[14] + s[15]);

        /* Whole block */
        u2_part_sads[PART_ID_2Nx2N][cand_idx] =
            u2_part_sads[PART_ID_2NxN_T][cand_idx] + u2_part_sads[PART_ID_2NxN_B][cand_idx];

        /* Wide AMP parts: whole block minus the complementary narrow strip */
        u2_part_sads[PART_ID_2NxnU_B][cand_idx] =
            u2_part_sads[PART_ID_2Nx2N][cand_idx] - u2_part_sads[PART_ID_2NxnU_T][cand_idx];
        u2_part_sads[PART_ID_2NxnD_T][cand_idx] =
            u2_part_sads[PART_ID_2Nx2N][cand_idx] - u2_part_sads[PART_ID_2NxnD_B][cand_idx];
        u2_part_sads[PART_ID_nRx2N_L][cand_idx] =
            u2_part_sads[PART_ID_2Nx2N][cand_idx] - u2_part_sads[PART_ID_nRx2N_R][cand_idx];
        u2_part_sads[PART_ID_nLx2N_R][cand_idx] =
            u2_part_sads[PART_ID_2Nx2N][cand_idx] - u2_part_sads[PART_ID_nLx2N_L][cand_idx];
    }
}
263 
264 /**
265 ********************************************************************************
266 *  @fn     compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
267 *                                       UWORD8      *pu1_cur_ptr,
268 *                                       WORD32      cur_buf_stride,
269 *                                       WORD32     **pi4_part_sads,
270 *                                       cand_t      *ps_cand,
271 *                                       WORD32      *num_cands
272 *
273 *  @brief  Computes partial SADs and updates partition results for an MxM blk
274 *          and does so for several grids of points. This can be used for
275 *          32x32/64x64 blks with 17 partition updates
276 *
277 *
278 *  @param[in]  ps_grid : Pointer to grid ctxt that has multiple grid of max
279 *                        9 pts per grid
280 *
281 *  @param[in]  pu1_cur_ptr : Top left of input buffer
282 *
283 *  @param[in]  pi4_part_sads : array of pointers, each entry pointing to
284 *                             results to be updated for a given partition
285 *
286 *  @return   The ps_search_results structure has the best result updated for
287 *            the 2Nx2N partition alone.
288 
289 ********************************************************************************
290 */
compute_part_sads_for_MxM_blk(grid_ctxt_t * ps_grid,UWORD8 * pu1_cur_ptr,WORD32 cur_buf_stride,WORD32 ** pp_part_sads,cand_t * ps_cand,WORD32 * num_cands,CU_SIZE_T e_cu_size)291 void compute_part_sads_for_MxM_blk(
292     grid_ctxt_t *ps_grid,
293     UWORD8 *pu1_cur_ptr,
294     WORD32 cur_buf_stride,
295     WORD32 **pp_part_sads,
296     cand_t *ps_cand,
297     WORD32 *num_cands,
298     CU_SIZE_T e_cu_size)
299 {
300     WORD32 a, b, c, d, i;
301     WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
302     WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
303 
304     /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
305     WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
306     WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
307     WORD32 shift = (WORD32)e_cu_size;
308 
309     WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
310     WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
311     WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
312     /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
313     WORD32 num_rows_in_nxn = 2 << shift;
314     WORD32 num_pixels_in_row = 2 << shift;
315     cand_t *cand0 = ps_cand;
316     /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
317     /* needed for AMP cases.                                              */
318     WORD32 a_nxn_sad[NUM_4X4];
319     *num_cands = 0;
320 
321     /* Loop to fill up the cand_t array and to calculate num_cands */
322     for(i = 0; i < ps_grid->num_grids; i++)
323     {
324         WORD32 j;
325         WORD32 mask = ps_grid->pi4_grd_mask[i];
326         UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
327         WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
328         WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
329 
330         for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
331         {
332             if(mask & 1)
333             {
334                 *num_cands = *num_cands + 1;
335                 cand0->grid_ix = i;
336                 cand0->ref_idx = ps_grid->p_ref_idx[i];
337                 cand0->pu1_ref_ptr =
338                     pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
339                 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
340                 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
341                 cand0++;
342             }
343         }
344     }
345 
346     /* Loop to compute the SAD's */
347     for(a = 0; a < *num_cands; a++)
348     {
349         cand_t *cand = ps_cand + a;
350         memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
351         for(b = 0; b < NUM_4X4; b++)
352         {
353             WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
354             WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
355 
356             for(c = 0; c < num_rows_in_nxn; c++)
357             {
358                 WORD32 z_cur = (cur_buf_stride)*c + t1;
359                 WORD32 z_ref = (ref_buf_stride)*c + t2;
360                 for(d = 0; d < num_pixels_in_row; d++)
361                 {
362                     a_nxn_sad[b] += (WORD32)ABS(
363                         (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
364                          ((WORD32)pu1_cur_ptr[(z_cur + d)])));
365                 }
366             }
367         }
368 
369         pp_part_sads[PART_ID_NxN_TL][a] =
370             (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
371         pp_part_sads[PART_ID_NxN_TR][a] =
372             (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
373         pp_part_sads[PART_ID_NxN_BL][a] =
374             (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
375         pp_part_sads[PART_ID_NxN_BR][a] =
376             (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
377         pp_part_sads[PART_ID_Nx2N_L][a] =
378             pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
379         pp_part_sads[PART_ID_Nx2N_R][a] =
380             pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
381         pp_part_sads[PART_ID_2NxN_T][a] =
382             pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
383         pp_part_sads[PART_ID_2NxN_B][a] =
384             pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
385         pp_part_sads[PART_ID_nLx2N_L][a] =
386             (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
387         pp_part_sads[PART_ID_nRx2N_R][a] =
388             (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
389         pp_part_sads[PART_ID_2NxnU_T][a] =
390             (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
391         pp_part_sads[PART_ID_2NxnD_B][a] =
392             (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
393         pp_part_sads[PART_ID_2Nx2N][a] =
394             pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
395         pp_part_sads[PART_ID_2NxnU_B][a] =
396             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
397         pp_part_sads[PART_ID_2NxnD_T][a] =
398             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
399         pp_part_sads[PART_ID_nRx2N_L][a] =
400             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
401         pp_part_sads[PART_ID_nLx2N_R][a] =
402             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
403     }
404 }
405 
/**
********************************************************************************
*  @fn     hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
*
*  @brief  Evaluates SADs of all partitions of a 16x16 blk over one grid of
*          up to 9 points, by wrapping the error params in a single-grid
*          ctxt and calling compute_4x4_sads_for_16x16_blk().
*
*  @param[in,out]  ps_prms : Error params. Reads pu1_inp, pu1_ref, the two
*                  strides, i4_step and i4_grid_mask; writes pi4_sad_grid
*                  partition-major, with one entry per enabled grid point
*                  inside each partition.
*
*  @return   None
********************************************************************************
*/
void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
{
    grid_ctxt_t s_grid;
    cand_t as_candt[9];
    U16 au2_sad_grid[TOT_NUM_PARTS * 9];
    U16 *apu2_sad_grid[TOT_NUM_PARTS];
    U16 *pu2_part_row;
    /* Placeholder mv / ref idx: the grid ctxt needs them, values are zero */
    hme_mv_t s_mv = { 0, 0 };
    S32 i4_ref_idx = 0;
    S32 num_candts = 0;
    S32 *pi4_dst;
    S32 i, num_results;

    /* Describe a single grid centred on the reference pointer */
    s_grid.num_grids = 1;
    s_grid.p_mv = &s_mv;
    s_grid.p_ref_idx = &i4_ref_idx;
    s_grid.ppu1_ref_ptr = &ps_prms->pu1_ref;
    s_grid.pi4_grd_mask = &ps_prms->i4_grid_mask;
    s_grid.ref_buf_stride = ps_prms->i4_ref_stride;
    /* Same step in both directions: y in the upper half, x in the lower */
    s_grid.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);

    /* Number of enabled points decides how many entries each row holds */
    for(i = 0; i < 9; i++)
    {
        num_candts += ((s_grid.pi4_grd_mask[0] & (1 << i)) != 0);
    }

    /* Carve the flat scratch buffer into one row per partition */
    pu2_part_row = &au2_sad_grid[0];
    for(i = 0; i < TOT_NUM_PARTS; i++)
    {
        apu2_sad_grid[i] = pu2_part_row;
        pu2_part_row += num_candts;
    }

    compute_4x4_sads_for_16x16_blk(
        &s_grid, ps_prms->pu1_inp, ps_prms->i4_inp_stride, apu2_sad_grid, as_candt, &num_candts);

    /* Copy the 16 bit scratch results out to the caller's SAD grid */
    pi4_dst = ps_prms->pi4_sad_grid;
    num_results = TOT_NUM_PARTS * num_candts;
    for(i = 0; i < num_results; i++)
    {
        pi4_dst[i] = au2_sad_grid[i];
    }
}
438 
/**
********************************************************************************
*  @fn     hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
*
*  @brief  Computes the SAD of one blk (i4_blk_wd x i4_blk_ht) at every
*          enabled point of a grid of up to 9 points, without partial
*          updates.
*
*  @param[in,out]  ps_prms : Error params. i4_part_mask must have at most
*                  one bit set. Results are written to pi4_sad_grid starting
*                  at offset (pi4_valid_part_ids[0] * number of enabled
*                  points), one entry per enabled point in grid id order.
*
*  @return   None
********************************************************************************
*/
void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
{
    S32 *pi4_sad_out = ps_prms->pi4_sad_grid;
    S32 step = ps_prms->i4_step;
    /* Byte distance of one grid step horizontally and vertically */
    S32 x_off = step;
    S32 y_off = step * ps_prms->i4_ref_stride;
    U08 *pu1_inp_base, *pu1_ref_c;
    S32 num_active = 0;
    S32 pt;

    /* x & (x - 1) is zero only when x has no more than one bit set */
    ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    for(pt = 0; pt < 9; pt++)
    {
        num_active += ((ps_prms->i4_grid_mask & (1 << pt)) != 0);
    }

    /* Skip ahead to the slot of the partition being evaluated */
    pi4_sad_out += (ps_prms->pi4_valid_part_ids[0] * num_active);

    pu1_inp_base = ps_prms->pu1_inp;
    pu1_ref_c = ps_prms->pu1_ref;

    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            S32 sad = 0;
            S32 row, col;
            U08 *pu1_inp_row = pu1_inp_base;
            /* Displace the centre by this point's table entries */
            U08 *pu1_ref_row = pu1_ref_c + x_off * gai1_grid_id_to_x[pt];

            pu1_ref_row += y_off * gai1_grid_id_to_y[pt];

            for(row = 0; row < ps_prms->i4_blk_ht; row++)
            {
                for(col = 0; col < ps_prms->i4_blk_wd; col++)
                {
                    S32 diff = pu1_inp_row[col] - pu1_ref_row[col];

                    sad += ABS(diff);
                }
                pu1_inp_row += ps_prms->i4_inp_stride;
                pu1_ref_row += ps_prms->i4_ref_stride;
            }
            *pi4_sad_out++ = sad;
        }
    }
}
483 
/**
********************************************************************************
*  @fn     hme_evalsad_pt_npu_MxN_8bit_compute(WORD32 ht,
*                                             WORD32 wd,
*                                             UWORD8 *pu1_inp,
*                                             UWORD8 *pu1_ref,
*                                             WORD32 i4_inp_stride,
*                                             WORD32 i4_ref_stride)
*
*  @brief  Sum of absolute differences between a wd x ht input blk and a
*          reference blk of the same size.
*
*  @param[in]  ht : Number of rows
*
*  @param[in]  wd : Number of pixels per row
*
*  @param[in]  pu1_inp : Top left of input blk
*
*  @param[in]  pu1_ref : Top left of reference blk
*
*  @param[in]  i4_inp_stride : Stride of input buffer
*
*  @param[in]  i4_ref_stride : Stride of reference buffer
*
*  @return   The SAD; 0 when ht or wd is not positive
********************************************************************************
*/
WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
    WORD32 ht,
    WORD32 wd,
    UWORD8 *pu1_inp,
    UWORD8 *pu1_ref,
    WORD32 i4_inp_stride,
    WORD32 i4_ref_stride)
{
    WORD32 acc = 0;
    WORD32 row = 0;

    while(row < ht)
    {
        WORD32 col;

        for(col = 0; col < wd; col++)
        {
            S32 diff = (S32)pu1_inp[col] - (S32)pu1_ref[col];

            acc += ABS(diff);
        }

        /* Step both buffers down to the next row */
        pu1_inp += i4_inp_stride;
        pu1_ref += i4_ref_stride;
        row++;
    }

    return acc;
}
505 
/**
********************************************************************************
*  @fn     hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
*
*  @brief  Single point SAD without partial updates: evaluates the SAD of
*          the i4_blk_wd x i4_blk_ht blk described by the error params.
*
*  @param[in,out]  ps_prms : Error params. Reads pu1_inp, pu1_ref, blk
*                  dimensions and both strides; the SAD is stored in
*                  pi4_sad_grid[0].
*
*  @return   None
********************************************************************************
*/
void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
{
    S32 sad = hme_evalsad_pt_npu_MxN_8bit_compute(
        ps_prms->i4_blk_ht,
        ps_prms->i4_blk_wd,
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride);

    ps_prms->pi4_sad_grid[0] = sad;
}
520 
/**
********************************************************************************
*  @fn     compute_satd_8bit(err_prms_t *ps_prms)
*
*  @brief  Accumulates the Hadamard (HAD) cost of a blk by tiling it with
*          basic transform units and summing the value returned by the HAD
*          function for each unit.
*          Blks up to 16x16 (both wd and ht <= 16) use 4x4 units; anything
*          larger in either dimension uses 8x8 units.
*
*  @param[in,out]  ps_prms : Error params. Reads pu1_inp, pu1_ref, both
*                  strides, blk dimensions and the optimised function list;
*                  the accumulated cost is stored in pi4_sad_grid[0].
*
*  @return   None
********************************************************************************
*/
void compute_satd_8bit(err_prms_t *ps_prms)
{
    U08 *pu1_src_row = ps_prms->pu1_inp;
    U08 *pu1_pred_row = ps_prms->pu1_ref;
    S32 src_strd = ps_prms->i4_inp_stride;
    S32 pred_strd = ps_prms->i4_ref_stride;
    S32 wd = ps_prms->i4_blk_wd;
    S32 ht = ps_prms->i4_blk_ht;
    U32 u4_satd = 0;
    WORD32 row, col;

    if((wd <= 16) && (ht <= 16))
    {
        /* Small blk: basic transform size is 4x4 */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 4)
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
                    pu1_src_row + col, src_strd, pu1_pred_row + col, pred_strd, NULL, 1);
            }
            pu1_src_row += src_strd * 4;
            pu1_pred_row += pred_strd * 4;
        }
    }
    else
    {
        /* Large blk: basic transform size is 8x8 */
        for(row = 0; row < ht; row += 8)
        {
            for(col = 0; col < wd; col += 8)
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                    pu1_src_row + col, src_strd, pu1_pred_row + col, pred_strd, NULL, 1);
            }
            pu1_src_row += src_strd * 8;
            pu1_pred_row += pred_strd * 8;
        }
    }

    ps_prms->pi4_sad_grid[0] = (S32)u4_satd;
}
574 
/**
********************************************************************************
*  @fn     hme_init_pred_part(pred_ctxt_t *ps_pred_ctxt, ...)
*
*  @brief  Records the neighbour / colocated / zero mv candidate nodes to be
*          used for one partition in the prediction ctxt. Only the pointers
*          are stored; the nodes themselves are not copied.
*
*  @param[in,out]  ps_pred_ctxt : Prediction ctxt whose as_pred_nodes entry
*                                 for e_part_id is filled in
*
*  @param[in]  ps_tl : Top left candidate node
*
*  @param[in]  ps_t : Top candidate node
*
*  @param[in]  ps_tr : Top right candidate node
*
*  @param[in]  ps_l : Left candidate node
*
*  @param[in]  ps_bl : Bottom left candidate node
*
*  @param[in]  ps_coloc : Colocated candidate node
*
*  @param[in]  ps_zeromv : Zero mv candidate node
*
*  @param[in]  pps_proj_coloc : Projected colocated candidate nodes
*
*  @param[in]  e_part_id : Partition id being initialised
*
*  @return   None
********************************************************************************
*/
void hme_init_pred_part(
    pred_ctxt_t *ps_pred_ctxt,
    search_node_t *ps_tl,
    search_node_t *ps_t,
    search_node_t *ps_tr,
    search_node_t *ps_l,
    search_node_t *ps_bl,
    search_node_t *ps_coloc,
    search_node_t *ps_zeromv,
    search_node_t **pps_proj_coloc,
    PART_ID_T e_part_id)
{
    pred_candt_nodes_t *ps_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];

    /* Spatial neighbours */
    ps_nodes->ps_l = ps_l;
    ps_nodes->ps_t = ps_t;
    ps_nodes->ps_tl = ps_tl;
    ps_nodes->ps_tr = ps_tr;
    ps_nodes->ps_bl = ps_bl;

    /* Non-spatial candidates */
    ps_nodes->ps_zeromv = ps_zeromv;
    ps_nodes->ps_coloc = ps_coloc;
    ps_nodes->pps_proj_coloc = pps_proj_coloc;
}
600 
/**
********************************************************************************
*  @fn     hme_init_pred_ctxt_no_encode(pred_ctxt_t *ps_pred_ctxt, ...)
*
*  @brief  Initialises the prediction ctxt for the no-encode case: stores
*          the cost parameters and sets up the candidate nodes of the 8x8
*          blk (PART_ID_2Nx2N) and of its four 4x4 sub blks
*          (PART_ID_NxN_TL/TR/BL/BR).
*
*  @param[out] ps_pred_ctxt : Prediction ctxt to initialise
*
*  @param[in]  ps_search_results : Search results; the per-partition result
*              nodes of direction pred_lx serve as neighbours of the sub
*              blks that follow them
*
*  @param[in]  ps_top_candts : Top row candidates, indexed TL, T, T1, TR
*
*  @param[in]  ps_left_candts : Left column candidates, indexed L, L1, BL.
*              The BL entry must be marked unavailable.
*
*  @param[in]  pps_proj_coloc_candts : Projected colocated candidates
*
*  @param[in]  ps_coloc_candts : Colocated candidate
*
*  @param[in]  ps_zeromv_candt : Zero mv candidate
*
*  @param[in]  pred_lx : Prediction direction (PRED_L0/PRED_L1)
*
*  @param[in]  lambda : Lambda stored in the ctxt
*
*  @param[in]  lambda_q_shift : Q shift of lambda
*
*  @param[in]  ppu1_ref_bits_tlu : Reference bits lookup stored in the ctxt
*
*  @param[in]  pi2_ref_scf : Reference scale factors stored in the ctxt
*
*  @return   None
********************************************************************************
*/
void hme_init_pred_ctxt_no_encode(
    pred_ctxt_t *ps_pred_ctxt,
    search_results_t *ps_search_results,
    search_node_t *ps_top_candts,
    search_node_t *ps_left_candts,
    search_node_t **pps_proj_coloc_candts,
    search_node_t *ps_coloc_candts,
    search_node_t *ps_zeromv_candt,
    S32 pred_lx,
    S32 lambda,
    S32 lambda_q_shift,
    U08 **ppu1_ref_bits_tlu,
    S16 *pi2_ref_scf)
{
    search_node_t *ps_invalid;

    /* Resolution to begin with: 0 (tagged FPEL in the original code) */
    ps_pred_ctxt->mv_pel = 0;

    /* Cost parameters and pred_lx (PRED_L0/PRED_L1) */
    ps_pred_ctxt->pred_lx = pred_lx;
    ps_pred_ctxt->lambda = lambda;
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
    ps_pred_ctxt->proj_used = 0;

    /* Bottom left should not be valid; it doubles as the "invalid" node */
    ASSERT(ps_left_candts[2].u1_is_avail == 0);
    ps_invalid = &ps_left_candts[2];

    /*************************************************************************/
    /* For the case of no encode, the idea is to set up candts as follows    */
    /*                                                                       */
    /*    ____ ______________                                                */
    /*   | TL | T  | T1 | TR |                                               */
    /*   |____|____|____|____|                                               */
    /*   | L  | b0 | b1 |                                                    */
    /*   |____|____|____|                                                    */
    /*   | L1 | b2 | b3 |                                                    */
    /*   |____|____|____|                                                    */
    /*   | BL |                                                              */
    /*   |____|                                                              */
    /*                                                                       */
    /* If use_4x4 is 0, then b0,b1,b2,b3 are a single 8x8 blk; then T=T1     */
    /* and L=L1. Top left, top and top right are TL,T,TR respectively.       */
    /* Left and bottom left are L and BL respectively.                       */
    /* If use_4x4 is 1, the above holds true only for PARTID = 0 (8x8).      */
    /* For the 4 sub blks (partids 4-7):                                     */
    /*                                                                       */
    /*  Block   Left   Top   Top Left   Top Right   Bottom Left              */
    /*    b0    L      T      TL          T1          L1                     */
    /*    b1    b0     T1     T           TR          BL (invalid)           */
    /*    b2    L1     b0     L           b1          BL (invalid)           */
    /*    b3    b2     b1     b0          BL (inv)    BL (invalid)           */
    /*                                                                       */
    /* Note: for blk b1, bottom left points to b2, which is not yet ready;   */
    /* hence it is kept invalid and made to point to BL. For blk b3, top     */
    /* right is invalid and hence made to point to BL, which is invalid.     */
    /* BL is invalid since it lies in a bottom left 8x8 blk that is not yet  */
    /* ready.                                                                */
    /*************************************************************************/

    /* The colocated candt always points to a fixed (global) candt.          */
    /* TODO : replace incoming coloc from global to genuine coloc            */

    /* 8x8 blk */
    hme_init_pred_part(
        ps_pred_ctxt,
        &ps_top_candts[0], /* top left     : TL */
        &ps_top_candts[2], /* top          : T1 */
        &ps_top_candts[3], /* top right    : TR */
        &ps_left_candts[1], /* left        : L1 */
        ps_invalid, /* bottom left         : invalid */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_2Nx2N);

    /* 4x4 TL blk (b0) */
    hme_init_pred_part(
        ps_pred_ctxt,
        &ps_top_candts[0], /* top left     : TL */
        &ps_top_candts[1], /* top          : T  */
        &ps_top_candts[2], /* top right    : T1 */
        &ps_left_candts[0], /* left        : L  */
        &ps_left_candts[1], /* bottom left : L1 */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_TL);

    /* 4x4 TR blk (b1) */
    hme_init_pred_part(
        ps_pred_ctxt,
        &ps_top_candts[1], /* top left     : T  */
        &ps_top_candts[2], /* top          : T1 */
        &ps_top_candts[3], /* top right    : TR */
        ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL], /* left : b0 */
        ps_invalid, /* bottom left         : invalid */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_TR);

    /* 4x4 BL blk (b2) */
    hme_init_pred_part(
        ps_pred_ctxt,
        &ps_left_candts[0], /* top left    : L  */
        ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL], /* top : b0 */
        ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR], /* top right : b1 */
        &ps_left_candts[1], /* left        : L1 */
        ps_invalid, /* bottom left         : invalid */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_BL);

    /* 4x4 BR blk (b3) */
    hme_init_pred_part(
        ps_pred_ctxt,
        ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL], /* top left : b0 */
        ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR], /* top : b1 */
        ps_invalid, /* top right           : invalid */
        ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL], /* left : b2 */
        ps_invalid, /* bottom left         : invalid */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_BR);
}
764 
/**
********************************************************************************
*  @fn     hme_init_pred_ctxt_encode(pred_ctxt_t *ps_pred_ctxt,
*                   search_results_t *ps_search_results,
*                   search_node_t *ps_coloc_candts,
*                   search_node_t *ps_zeromv_candt,
*                   mv_grid_t *ps_mv_grid,
*                   S32 pred_lx,
*                   S32 lambda,
*                   S32 lambda_q_shift,
*                   U08 **ppu1_ref_bits_tlu,
*                   S16 *pi2_ref_scf)
*
*  @brief  Fills the mv pred context for a CU: stores the cost parameters and,
*          for every partition id, registers the neighbour candidates
*          (TL, T, TR, L, BL), the coloc candidate and the zero mv candidate
*          through hme_init_pred_part(). No projected coloc list is passed
*          (NULL is given to hme_init_pred_part()).
*
*  @param[out] ps_pred_ctxt : mv pred context to be initialised
*
*  @param[in]  ps_search_results : supplies CU size, CU x/y offset and the
*              per partition results used as candidates for later partitions
*
*  @param[in]  ps_coloc_candts : coloc candidate, used as is for all partitions
*
*  @param[in]  ps_zeromv_candt : zero mv candidate
*
*  @param[in]  ps_mv_grid : grid of search nodes from which neighbours are taken
*
*  @param[in]  pred_lx : prediction list index, stored in the context and used
*              to index aps_part_results
*
*  @param[in]  lambda, lambda_q_shift : stored in the context
*
*  @param[in]  ppu1_ref_bits_tlu : ref bits lookup, stored in the context
*
*  @param[in]  pi2_ref_scf : ref scale factors, stored in the context
*
*  @return   None
********************************************************************************
*/
void hme_init_pred_ctxt_encode(
    pred_ctxt_t *ps_pred_ctxt,
    search_results_t *ps_search_results,
    search_node_t *ps_coloc_candts,
    search_node_t *ps_zeromv_candt,
    mv_grid_t *ps_mv_grid,
    S32 pred_lx,
    S32 lambda,
    S32 lambda_q_shift,
    U08 **ppu1_ref_bits_tlu,
    S16 *pi2_ref_scf)
{
    CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;

    /* Shift applied to the partition attributes before converting to 4x4 units */
    S32 part_attr_shift = e_cu_size;

    /* Shift applied to the CU position when indexing the CU validity tables */
    S32 valid_tbl_shift = 1 + e_cu_size;

    /* Stride of the mv grid */
    S32 grid_stride = ps_mv_grid->i4_stride;

    /* CU position in 4x4 units */
    S32 cu_x4 = ps_search_results->u1_x_off >> 2;
    S32 cu_y4 = ps_search_results->u1_y_off >> 2;

    /* Top right and bottom left validity at CU level */
    S32 cu_tr_valid, cu_bl_valid;

    search_node_t *ps_grid_cu_base, *ps_invalid, *ps_coloc;
    S32 part_id;

    /* Parameters that the cost functions read back from the context */
    ps_pred_ctxt->lambda = lambda;
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
    ps_pred_ctxt->pred_lx = pred_lx;
    ps_pred_ctxt->mv_pel = 0;
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
    ps_pred_ctxt->proj_used = 1;

    /* The same incoming coloc candidate is used for every partition */
    ps_coloc = ps_coloc_candts;

    /* Node of the mv grid that corresponds to the top left 4x4 of the CU */
    ps_grid_cu_base = &ps_mv_grid->as_node[0];
    ps_grid_cu_base += (ps_mv_grid->i4_start_offset + cu_x4);
    ps_grid_cu_base += (grid_stride * cu_y4);

    /* Node used whenever a neighbour must be treated as unavailable: first  */
    /* node of row 17 of the grid array (expected to never be marked valid)  */
    ps_invalid = &ps_mv_grid->as_node[0] + (grid_stride * 17);

    cu_tr_valid = gau1_cu_tr_valid[cu_y4 >> valid_tbl_shift][cu_x4 >> valid_tbl_shift];
    cu_bl_valid = gau1_cu_bl_valid[cu_y4 >> valid_tbl_shift][cu_x4 >> valid_tbl_shift];

    /*************************************************************************/
    /* Neighbour layout, shown for a 16x16 CU split into partitions p0, p1   */
    /*                                                                       */
    /*    ____ ______________ ____ ____                                      */
    /*   | T0 | T1 | T2 | T3 | T4 | T5 |                                     */
    /*   |____|____|____|____|____|____|                                     */
    /*   | L1 |    |              |                                          */
    /*   |____|    |              |                                          */
    /*   | L2 | p0 |     p1       |                                          */
    /*   |____|    |              |                                          */
    /*   | L3 |    |              |                                          */
    /*   |____|    |              |                                          */
    /*   | L4 | L' |              |                                          */
    /*   |____|____|______________|                                          */
    /*   | BL |                                                              */
    /*   |____|                                                              */
    /*                                                                       */
    /*  With (x, y) measured in 4x4 units from the top left 4x4 of the       */
    /*  partition, and part_wd / part_ht the partition size in 4x4 units,    */
    /*  the grid neighbours picked below are:                                */
    /*      TL = (-1, -1)            T  = (part_wd - 1, -1)                  */
    /*      TR = (part_wd, -1)       L  = (-1, part_ht - 1)                  */
    /*      BL = (-1, part_ht)                                               */
    /*  TR and BL are replaced by the invalid node when either the partition */
    /*  level or the CU level validity flag is zero.                         */
    /*  Partitions that follow another partition of the same type take the   */
    /*  earlier partition's search result as candidate instead of a grid     */
    /*  node (see the switch on part_num below).                             */
    /*************************************************************************/

    /* Loop over all partitions, and identify the 5 neighbours */
    for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
    {
        part_attr_t *ps_attr = &gas_part_attr_in_cu[part_id];
        PART_TYPE_T e_part_type = ge_part_id_to_part_type[part_id];
        S32 part_num = gau1_part_id_to_part_num[part_id];
        S32 tr_valid = gau1_partid_tr_valid[part_id] & cu_tr_valid;
        S32 bl_valid = gau1_partid_bl_valid[part_id] & cu_bl_valid;

        /* Partition start and size in 4x4 units */
        S32 part_x4 = (ps_attr->u1_x_start << part_attr_shift) >> 2;
        S32 part_y4 = (ps_attr->u1_y_start << part_attr_shift) >> 2;
        S32 part_wd4 = (ps_attr->u1_x_count << part_attr_shift) >> 2;
        S32 part_ht4 = (ps_attr->u1_y_count << part_attr_shift) >> 2;

        search_node_t *ps_grid_pu_base;
        search_node_t *ps_tl, *ps_t, *ps_tr, *ps_l, *ps_bl;

        /* Grid node at the top left 4x4 of the partition */
        ps_grid_pu_base = ps_grid_cu_base + part_x4;
        ps_grid_pu_base += (part_y4 * grid_stride);

        ps_tl = ps_grid_pu_base - 1 - grid_stride;
        ps_t = ps_grid_pu_base - grid_stride + part_wd4 - 1;
        ps_l = ps_grid_pu_base - 1 + ((part_ht4 - 1) * grid_stride);
        ps_tr = tr_valid ? (ps_t + 1) : ps_invalid;
        ps_bl = bl_valid ? (ps_l + grid_stride) : ps_invalid;

        switch(part_num)
        {
        case 1:
        {
            /* 2nd partition of a type: the 1st partition's result replaces  */
            /* the left candidate (vertical type) or the top candidate       */
            /* (otherwise)                                                   */
            PART_ID_T first_part = ge_part_type_to_part_id[e_part_type][0];

            if(gau1_is_vert_part[e_part_type])
            {
                ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
            }
            else
            {
                ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
            }
            break;
        }
        case 2:
            /* only possible for NxN_BL */
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
            ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
            break;
        case 3:
            /* only possible for NxN_BR */
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
            ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
            ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
            break;
        default:
            break;
        }

        hme_init_pred_part(
            ps_pred_ctxt,
            ps_tl,
            ps_t,
            ps_tr,
            ps_l,
            ps_bl,
            ps_coloc,
            ps_zeromv_candt,
            NULL,
            (PART_ID_T)part_id);
    }
}
936 
937 /**
938 ********************************************************************************
939 *  @fn     compute_mv_cost_explicit(search_node_t *ps_node,
940 *                   pred_ctxt_t *ps_pred_ctxt,
941 *                   PART_ID_T e_part_id)
942 *
943 *  @brief  MV cost for explicit search in layers not encoded
944 *
945 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
946 *
947 *  @param[in]  ps_pred_ctxt : mv pred context
948 *
949 *  @param[in]  e_part_id : Partition id.
950 *
951 *  @return   Cost value
952 
953 ********************************************************************************
954 */
compute_mv_cost_explicit(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)955 S32 compute_mv_cost_explicit(
956     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
957 {
958 #define RETURN_FIXED_COST 0
959     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
960     pred_candt_nodes_t *ps_pred_nodes;
961     S32 inp_shift = 2 - inp_mv_pel;
962     S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
963     S32 mv_p_x, mv_p_y;
964     S16 mvdx1, mvdx2, mvdy1, mvdy2;
965     S32 cost, ref_bits;
966 
967     /*************************************************************************/
968     /* Logic for cost computation for explicit search. For such a search,    */
969     /* it is guaranteed that all predictor candts have same ref id. The only */
970     /* probable issue is with the availability which needs checking. This fxn*/
971     /* does not suffer the need to scale predictor candts due to diff ref id */
972     /*************************************************************************/
973 
974     /* Hack: currently we always assume 2Nx2N. */
975     /* TODO: get rid of this hack and return cost tuned to each partition */
976     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
977     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
978 
979     /*************************************************************************/
980     /* Priority to bottom left availability. Else we go to left. If both are */
981     /* not available, then a remains null                                    */
982     /*************************************************************************/
983     if(ps_pred_nodes->ps_tl->u1_is_avail)
984         ps_pred_node_a = ps_pred_nodes->ps_tl;
985     else if(ps_pred_nodes->ps_l->u1_is_avail)
986         ps_pred_node_a = ps_pred_nodes->ps_l;
987 
988     /*************************************************************************/
989     /* For encoder, top left may not be really needed unless we use slices,  */
990     /* and even then in ME it may not be relevant. So we only consider T or  */
991     /* TR, as, if both T and TR are not available, TL also will not be       */
992     /*************************************************************************/
993     if(ps_pred_nodes->ps_tr->u1_is_avail)
994         ps_pred_node_b = ps_pred_nodes->ps_tr;
995     else if(ps_pred_nodes->ps_t->u1_is_avail)
996         ps_pred_node_b = ps_pred_nodes->ps_t;
997 
998     if(ps_pred_node_a == NULL)
999     {
1000         ps_pred_node_a = ps_pred_nodes->ps_coloc;
1001         if(ps_pred_node_b == NULL)
1002             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1003     }
1004     else if(ps_pred_node_b == NULL)
1005         ps_pred_node_b = ps_pred_nodes->ps_coloc;
1006     else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1007     {
1008         ps_pred_node_b = ps_pred_nodes->ps_coloc;
1009     }
1010 
1011     mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1012     mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1013     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1014     mvdx1 = ABS(mvdx1);
1015     mvdy1 = ABS(mvdy1);
1016 
1017     mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1018     mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1019     COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1020     mvdx2 = ABS(mvdx2);
1021     mvdy2 = ABS(mvdy2);
1022 
1023     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1024     {
1025         cost =
1026             hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1027     }
1028     else
1029     {
1030         cost =
1031             hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1032     }
1033     {
1034         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1035         return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1036     }
1037 }
1038 /**
1039 ********************************************************************************
1040 *  @fn     compute_mv_cost_coarse(search_node_t *ps_node,
1041 *                   pred_ctxt_t *ps_pred_ctxt,
1042 *                   PART_ID_T e_part_id)
1043 *
1044 *  @brief  MV cost for coarse explicit search in coarsest layer
1045 *
1046 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1047 *
1048 *  @param[in]  ps_pred_ctxt : mv pred context
1049 *
1050 *  @param[in]  e_part_id : Partition id.
1051 *
1052 *  @return   Cost value
1053 
1054 ********************************************************************************
1055 */
compute_mv_cost_coarse(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1056 S32 compute_mv_cost_coarse(
1057     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1058 {
1059     ARG_NOT_USED(e_part_id);
1060 
1061     return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
1062 }
1063 
1064 /**
1065 ********************************************************************************
1066 *  @fn     compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
1067 *                                            pred_ctxt_t *ps_pred_ctxt,
1068 *                                            PART_ID_T e_part_id)
1069 *
1070 *  @brief  MV cost for coarse explicit search in coarsest layer
1071 *
1072 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1073 *
1074 *  @param[in]  ps_pred_ctxt : mv pred context
1075 *
1076 *  @param[in]  e_part_id : Partition id.
1077 *
1078 *  @return   Cost value
1079 
1080 ********************************************************************************
1081 */
compute_mv_cost_coarse_high_speed(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1082 S32 compute_mv_cost_coarse_high_speed(
1083     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1084 {
1085     S32 rnd, mvx, mvy, i4_search_idx;
1086     S32 cost;
1087 
1088     mvx = ps_node->s_mv.i2_mvx;
1089     mvy = ps_node->s_mv.i2_mvy;
1090     i4_search_idx = ps_node->i1_ref_idx;
1091 
1092     cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
1093     cost += (mvx != 0) ? 1 : 0;
1094     cost += (mvy != 0) ? 1 : 0;
1095     rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1096     cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
1097     return cost;
1098 }
1099 
1100 /**
1101 ********************************************************************************
1102 *  @fn     compute_mv_cost_explicit_refine(search_node_t *ps_node,
1103 *                                          pred_ctxt_t *ps_pred_ctxt,
1104 *                                          PART_ID_T e_part_id)
1105 *
1106 *  @brief  MV cost for explicit search in layers not encoded. Always returns
1107 *          cost of the projected colocated candidate
1108 *
1109 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1110 *
1111 *  @param[in]  ps_pred_ctxt : mv pred context
1112 *
1113 *  @param[in]  e_part_id : Partition id.
1114 *
1115 *  @return   Cost value
1116 
1117 ********************************************************************************
1118 */
compute_mv_cost_explicit_refine(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1119 S32 compute_mv_cost_explicit_refine(
1120     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1121 {
1122     search_node_t *ps_pred_node_a = NULL;
1123     pred_candt_nodes_t *ps_pred_nodes;
1124     S32 inp_shift = 2 - inp_mv_pel;
1125     S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
1126     S32 mv_p_x, mv_p_y;
1127     S16 mvdx1, mvdy1;
1128     S32 cost, ref_bits;
1129 
1130     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1131     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1132 
1133     ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
1134 
1135     mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1136     mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1137     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1138     mvdx1 = ABS(mvdx1);
1139     mvdy1 = ABS(mvdy1);
1140 
1141     cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1142 
1143     {
1144         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1145         return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1146     }
1147 }
1148 
1149 /**
1150 ********************************************************************************
1151 *  @fn     compute_mv_cost_refine(search_node_t *ps_node,
1152 *                   pred_ctxt_t *ps_pred_ctxt,
1153 *                   PART_ID_T e_part_id)
1154 *
1155 *  @brief  MV cost for coarse explicit search in coarsest layer
1156 *
1157 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1158 *
1159 *  @param[in]  ps_pred_ctxt : mv pred context
1160 *
1161 *  @param[in]  e_part_id : Partition id.
1162 *
1163 *  @return   Cost value
1164 
1165 ********************************************************************************
1166 */
compute_mv_cost_refine(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1167 S32 compute_mv_cost_refine(
1168     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1169 {
1170     return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
1171 }
1172 
/**
********************************************************************************
*  @fn     compute_mv_cost_implicit(search_node_t *ps_node,
*                   pred_ctxt_t *ps_pred_ctxt,
*                   PART_ID_T e_part_id,
*                   S32 inp_mv_pel)
*
*  @brief  MV cost for implicit search. Predictor candidates may carry a ref
*          idx different from that of the input node; candidates with a
*          matching ref idx are preferred, and a chosen candidate with a
*          different ref idx goes through SCALE_FOR_POC_DELTA. The cost of
*          the closer of two candidates is combined with the partition bits
*          (in Q1) and weighted by lambda.
*
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
*
*  @param[in]  ps_pred_ctxt : mv pred context
*
*  @param[in]  e_part_id : Partition id, selects the set of predictor nodes
*              and the partition bits
*
*  @param[in]  inp_mv_pel : mv resolution of ps_node; the input mv is shifted
*              by (2 - inp_mv_pel) inside COMPUTE_DIFF_MV
*
*  @return   Cost value
********************************************************************************
*/
S32 compute_mv_cost_implicit(
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
{
    pred_candt_nodes_t *ps_cands = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    search_node_t *ps_cand_a = NULL;
    search_node_t *ps_cand_b = NULL;

    /* Ref idx of the node being costed */
    S08 i1_tgt_ref = ps_node->i1_ref_idx;

    /* Ref idx of each neighbour; left at -1 when neighbour is not available */
    S08 i1_bl_ref = -1, i1_l_ref = -1;
    S08 i1_tr_ref = -1, i1_t_ref = -1, i1_tl_ref = -1;

    S32 inp_shift = 2 - inp_mv_pel;
    S32 pred_shift;
    S32 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_tgt_ref];
    S32 mv_p_x, mv_p_y;
    S16 mvdx_a, mvdy_a, mvdx_b, mvdy_b;
    S16 mvdx_best, mvdy_best;
    S32 cost, rnd, tot_cost;

    /*************************************************************************/
    /* Candidate A, from bottom left and left. First choice is a neighbour   */
    /* whose ref idx equals that of the input node (BL before L); otherwise  */
    /* a neighbour whose recorded ref idx is not -1 (BL before L). If none   */
    /* qualifies, A remains NULL.                                            */
    /*************************************************************************/
    if(ps_cands->ps_bl->u1_is_avail)
    {
        i1_bl_ref = ps_cands->ps_bl->i1_ref_idx;
    }
    if(ps_cands->ps_l->u1_is_avail)
    {
        i1_l_ref = ps_cands->ps_l->i1_ref_idx;
    }

    if(i1_bl_ref == i1_tgt_ref)
    {
        ps_cand_a = ps_cands->ps_bl;
    }
    else if(i1_l_ref == i1_tgt_ref)
    {
        ps_cand_a = ps_cands->ps_l;
    }
    else if(i1_bl_ref != -1)
    {
        ps_cand_a = ps_cands->ps_bl;
    }
    else if(i1_l_ref != -1)
    {
        ps_cand_a = ps_cands->ps_l;
    }

    /*************************************************************************/
    /* Candidate B, from top right, top and top left, chosen by the same two */
    /* rules (priority TR, then T, then TL). If none qualifies, B remains    */
    /* NULL.                                                                 */
    /*************************************************************************/
    if(ps_cands->ps_tr->u1_is_avail)
    {
        i1_tr_ref = ps_cands->ps_tr->i1_ref_idx;
    }
    if(ps_cands->ps_t->u1_is_avail)
    {
        i1_t_ref = ps_cands->ps_t->i1_ref_idx;
    }
    if(ps_cands->ps_tl->u1_is_avail)
    {
        i1_tl_ref = ps_cands->ps_tl->i1_ref_idx;
    }

    if(i1_tr_ref == i1_tgt_ref)
    {
        ps_cand_b = ps_cands->ps_tr;
    }
    else if(i1_t_ref == i1_tgt_ref)
    {
        ps_cand_b = ps_cands->ps_t;
    }
    else if(i1_tl_ref == i1_tgt_ref)
    {
        ps_cand_b = ps_cands->ps_tl;
    }
    else if(i1_tr_ref != -1)
    {
        ps_cand_b = ps_cands->ps_tr;
    }
    else if(i1_t_ref != -1)
    {
        ps_cand_b = ps_cands->ps_t;
    }
    else if(i1_tl_ref != -1)
    {
        ps_cand_b = ps_cands->ps_tl;
    }

    /*************************************************************************/
    /* Fallbacks: a missing A becomes coloc (and B, if also missing, becomes */
    /* zero mv). With A present, B becomes coloc when it is missing or when  */
    /* hme_cmp_nodes() returns 0 for the pair.                               */
    /*************************************************************************/
    if(NULL == ps_cand_a)
    {
        ps_cand_a = ps_cands->ps_coloc;
        if(NULL == ps_cand_b)
        {
            ps_cand_b = ps_cands->ps_zeromv;
        }
    }
    else if((NULL == ps_cand_b) || (0 == hme_cmp_nodes(ps_cand_a, ps_cand_b)))
    {
        ps_cand_b = ps_cands->ps_coloc;
    }

    /* Predictor mv from A: taken directly when ref idx matches, else via    */
    /* SCALE_FOR_POC_DELTA                                                   */
    if(ps_cand_a->i1_ref_idx == i1_tgt_ref)
    {
        mv_p_x = ps_cand_a->s_mv.i2_mvx;
        mv_p_y = ps_cand_a->s_mv.i2_mvy;
    }
    else
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_cand_a, i1_tgt_ref, ps_pred_ctxt->pi2_ref_scf);
    }
    /* Predictor is shifted by 2 only if subpel is not yet done for it */
    pred_shift = ps_cand_a->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx_a, mvdy_a, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx_a = ABS(mvdx_a);
    mvdy_a = ABS(mvdy_a);

    /* Same steps for B */
    if(ps_cand_b->i1_ref_idx == i1_tgt_ref)
    {
        mv_p_x = ps_cand_b->s_mv.i2_mvx;
        mv_p_y = ps_cand_b->s_mv.i2_mvy;
    }
    else
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_cand_b, i1_tgt_ref, ps_pred_ctxt->pi2_ref_scf);
    }
    pred_shift = ps_cand_b->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx_b, mvdy_b, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx_b = ABS(mvdx_b);
    mvdy_b = ABS(mvdy_b);

    /* Keep the candidate with the strictly smaller sum; ties go to B */
    if((mvdx_a + mvdy_a) < (mvdx_b + mvdy_b))
    {
        mvdx_best = mvdx_a;
        mvdy_best = mvdy_a;
    }
    else
    {
        mvdx_best = mvdx_b;
        mvdy_best = mvdy_b;
    }

    cost = 2 * hme_get_range(mvdx_best) + 2 * hme_get_range(mvdy_best) + 2 * (mvdx_best > 0) +
           2 * (mvdy_best > 0) + ref_bits + 2;

    /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
    rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
    tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
    tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);

    return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
}
1307 
/**
********************************************************************************
*  @fn     compute_mv_cost_implicit_high_speed(search_node_t *ps_node,
*                   pred_ctxt_t *ps_pred_ctxt,
*                   PART_ID_T e_part_id,
*                   S32 inp_mv_pel)
*
*  @brief  Reduced complexity MV cost for implicit search. Only two predictor
*          candidates are looked at: the left neighbour, and either the top
*          right neighbour or the coloc candidate. The cost of the closer one
*          is weighted by lambda. No partition bits are added.
*
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
*
*  @param[in]  ps_pred_ctxt : mv pred context
*
*  @param[in]  e_part_id : Partition id, selects the set of predictor nodes
*
*  @param[in]  inp_mv_pel : mv resolution of ps_node; the input mv is shifted
*              by (2 - inp_mv_pel) inside COMPUTE_DIFF_MV
*
*  @return   Cost value
********************************************************************************
*/
S32 compute_mv_cost_implicit_high_speed(
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
{
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
    pred_candt_nodes_t *ps_pred_nodes;
    S08 i1_ref_idx;
    S32 inp_shift = 2 - inp_mv_pel;
    S32 pred_shift;
    S32 ref_bits, cost;
    S32 mv_p_x, mv_p_y;
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
    S16 mvdx, mvdy;

    i1_ref_idx = ps_node->i1_ref_idx;

    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];

    /*************************************************************************/
    /* Candidate A: the left neighbour if it is available. Bottom left is    */
    /* not considered here. If left is not available, A remains NULL.        */
    /*************************************************************************/
    if(ps_pred_nodes->ps_l->u1_is_avail)
    {
        ps_pred_node_a = ps_pred_nodes->ps_l;
    }

    /*************************************************************************/
    /* Candidate B: the top right neighbour, but only when projected candts  */
    /* are not in use (proj_used is 0) and top right is available. In every  */
    /* other case B is the coloc candidate. Top and top left are not         */
    /* considered here.                                                      */
    /*************************************************************************/
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
    {
        ps_pred_node_b = ps_pred_nodes->ps_tr;
    }
    else
    {
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
    }

    /* A missing A becomes coloc; if B was coloc as well, B moves to zero mv */
    if(ps_pred_node_a == NULL)
    {
        ps_pred_node_a = ps_pred_nodes->ps_coloc;

        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
    }

    /* Predictor mv from A: taken directly when ref idx matches, else via    */
    /* SCALE_FOR_POC_DELTA                                                   */
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
    }
    else
    {
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
    }

    /* Predictor is shifted by 2 only if subpel is not yet done for it */
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx1 = ABS(mvdx1);
    mvdy1 = ABS(mvdy1);

    /* Same steps for B */
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
    }
    else
    {
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
    }

    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx2 = ABS(mvdx2);
    mvdy2 = ABS(mvdy2);

    /* Keep the candidate with the strictly smaller sum; ties go to B */
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
    {
        mvdx = mvdx1;
        mvdy = mvdy1;
    }
    else
    {
        mvdx = mvdx2;
        mvdy = mvdy2;
    }

    cost = hme_get_range(mvdx) + hme_get_range(mvdy) + (mvdx > 0) + (mvdy > 0) + ref_bits + 2;

    {
        /* Weight by lambda and round to nearest at lambda_q_shift precision */
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
        S32 tot_cost = (cost * ps_pred_ctxt->lambda);

        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
    }
}
1409 
/**
********************************************************************************
*  @fn     compute_mv_cost_implicit_high_speed_modified(search_node_t *ps_node,
*                   pred_ctxt_t *ps_pred_ctxt,
*                   PART_ID_T e_part_id,
*                   S32 inp_mv_pel)
*
*  @brief  MV cost measured against a single predictor: the node stored as
*          ps_mvp_node for the given partition. The mv of that node is used
*          as is; no ref idx check or scaling is done here.
*
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
*
*  @param[in]  ps_pred_ctxt : mv pred context
*
*  @param[in]  e_part_id : Partition id, selects the set of predictor nodes
*
*  @param[in]  inp_mv_pel : mv resolution of ps_node; the input mv is shifted
*              by (2 - inp_mv_pel) inside COMPUTE_DIFF_MV
*
*  @return   Cost value
********************************************************************************
*/
S32 compute_mv_cost_implicit_high_speed_modified(
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
{
    pred_candt_nodes_t *ps_cands = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    S32 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];

    /* The only predictor considered */
    search_node_t *ps_mvp = ps_cands->ps_mvp_node;

    S32 inp_shift = 2 - inp_mv_pel;
    S32 mv_p_x = ps_mvp->s_mv.i2_mvx;
    S32 mv_p_y = ps_mvp->s_mv.i2_mvy;

    /* Predictor is shifted by 2 only if subpel is not yet done for it */
    S32 pred_shift = ps_mvp->u1_subpel_done ? 0 : 2;

    S16 mvdx, mvdy;
    S32 cost, rnd;

    /* Absolute mv difference between input node and predictor */
    COMPUTE_DIFF_MV(mvdx, mvdy, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx = ABS(mvdx);
    mvdy = ABS(mvdy);

    cost = hme_get_range(mvdx) + hme_get_range(mvdy) + (mvdx > 0) + (mvdy > 0) + ref_bits + 2;

    /* Weight by lambda and round to nearest at lambda_q_shift precision */
    rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
    return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
}
1440 
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
*
*  @brief  Grid result update restricted to the first valid partition and to a
*          single best result. Every active grid point is costed (SAD + coarse
*          mv cost) and replaces result 0 of that partition when it is cheaper.
*
*  @param[in,out]  ps_result_prms : input parameters and results
*              ::ps_search_node_base : centre candidate of the grid
*              ::i4_step : multiplies the per point x/y offsets (FPEL units)
*              ::i4_grid_mask : bit n set => grid point n is evaluated
*              ::pi4_sad_grid : SADs; entry [num active pts * part id] is read,
*                               the base advances once per active point
*              ::pi4_valid_part_ids : only entry 0 is used
*              ::i1_ref_idx : selects the result list and the pred context
*              ::i4_min_cost : written with the cost of every accepted point
*              ::i4_min_id : on return, last accepted grid point, PT_C if none
*
*  @return  None
********************************************************************************
*/
void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
{
    /* The function modified with assumption that only 2NxN_B and Nx2N_R is modified */

    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    S32 *pi4_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 *pi4_sad = ps_result_prms->pi4_sad_grid;

    S32 grid_mask = ps_result_prms->i4_grid_mask;
    S32 step = ps_result_prms->i4_step;
    S32 num_results = (S32)ps_results->u1_num_results_per_part;
    S32 ref_idx = (S32)ps_result_prms->i1_ref_idx;
    S32 pred_lx = 1 - ps_results->pu1_is_past[ref_idx];

    /* Only the first valid partition is handled by this variant */
    S32 unique_part_id = pi4_part_ids[0];
    S32 part_id = pi4_part_ids[0];

    /* Best results stored so far for this reference and partition */
    search_node_t *ps_best = ps_results->aps_part_results[ref_idx][part_id];

    /* Working copy of the centre candidate, only its mv changes per point */
    search_node_t s_cand = *ps_centre;

    S32 best_pt = (S32)PT_C;
    S32 num_active_pts = 0;
    S32 bit, pt;

    /* Number of active points: distance between SADs of successive parts */
    for(bit = 0; bit < 9; bit++)
    {
        num_active_pts += ((grid_mask & (1 << bit)) != 0);
    }

    /*************************************************************************/
    /* Visit every active point of the grid                                  */
    /*************************************************************************/
    for(pt = 0; pt < (S32)NUM_GRID_PTS; pt++)
    {
        S32 mv_cost, sad, tot_cost;

        if(grid_mask & (1 << pt))
        {
            /* Displace the centre mv to this grid point (FPEL units) */
            s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
            s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
            s_cand.s_mv.i2_mvx += (S16)(step * gai1_grid_id_to_x[pt]);
            s_cand.s_mv.i2_mvy += (S16)(step * gai1_grid_id_to_y[pt]);

            /* Total cost of this point for the partition */
            mv_cost = compute_mv_cost_coarse_high_speed(
                &s_cand, &ps_results->as_pred_ctxt[pred_lx], (PART_ID_T)part_id, MV_RES_FPEL);
            sad = pi4_sad[num_active_pts * part_id];
            tot_cost = sad + mv_cost;

            ASSERT(unique_part_id == part_id);
            ASSERT(num_results == 1);

            /* Accept the point only when it beats the last stored result */
            if(tot_cost < ps_best[num_results - 1].i4_tot_cost)
            {
                best_pt = pt;
                ps_result_prms->i4_min_cost = tot_cost;

                ps_best[0] = s_cand;
                ps_best[0].i4_sad = sad;
                ps_best[0].i4_mv_cost = mv_cost;
                ps_best[0].i4_tot_cost = tot_cost;
            }

            /* SADs of the next active point start one entry later */
            pi4_sad++;
        }
    }

    ps_result_prms->i4_min_id = best_pt;
}
1555 
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
*
*  @brief  For every active point of the search grid and every valid partition,
*          forms total cost = SAD + mv cost and inserts the candidate into the
*          cost sorted list of best results of that partition.
*
*  @param[in,out]  ps_result_prms : input parameters and results
*              ::ps_search_node_base : centre candidate of the grid
*              ::i4_step : multiplies the per point x/y offsets (FPEL units)
*              ::i4_grid_mask : bit n set => grid point n is evaluated
*              ::pi4_sad_grid : SADs; entry [num active pts * part id] is read,
*                               the base advances once per active point
*              ::pi4_valid_part_ids : part ids, terminated by a negative value
*              ::pf_mv_cost_compute : mv cost function
*              ::i1_ref_idx : selects the result list and the pred context
*              ::i4_min_cost : lowered whenever the first valid partition beats
*                              it; it is not reset by this function
*              ::i4_min_id : on return, grid point that last lowered
*                            i4_min_cost, PT_C if none did
*
*  @return  None
********************************************************************************
*/
void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    S32 *pi4_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 *pi4_sad = ps_result_prms->pi4_sad_grid;

    S32 grid_mask = ps_result_prms->i4_grid_mask;
    S32 step = ps_result_prms->i4_step;
    S32 num_results = (S32)ps_results->u1_num_results_per_part;
    S32 ref_idx = (S32)ps_result_prms->i1_ref_idx;
    S32 pred_lx = 1 - ps_results->pu1_is_past[ref_idx];

    /* Partition whose cost drives i4_min_cost / i4_min_id */
    S32 unique_part_id = pi4_part_ids[0];

    /* Working copy of the centre candidate, only its mv changes per point */
    search_node_t s_cand = *ps_centre;

    S32 best_pt = (S32)PT_C;
    S32 num_active_pts = 0;
    S32 bit, pt;

    /* Number of active points: distance between SADs of successive parts */
    for(bit = 0; bit < 9; bit++)
    {
        num_active_pts += ((grid_mask & (1 << bit)) != 0);
    }

    /*************************************************************************/
    /* Visit every active point of the grid                                  */
    /*************************************************************************/
    for(pt = 0; pt < (S32)NUM_GRID_PTS; pt++)
    {
        S32 part_pos, part_id;

        if(grid_mask & (1 << pt))
        {
            /* Displace the centre mv to this grid point (FPEL units) */
            s_cand.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
            s_cand.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
            s_cand.s_mv.i2_mvx += (S16)(step * gai1_grid_id_to_x[pt]);
            s_cand.s_mv.i2_mvy += (S16)(step * gai1_grid_id_to_y[pt]);

            /* Walk the list of valid partitions up to its negative terminator */
            for(part_pos = 0; (part_id = pi4_part_ids[part_pos]) >= 0; part_pos++)
            {
                /* Best results stored so far for this reference and partition */
                search_node_t *ps_best = ps_results->aps_part_results[ref_idx][part_id];
                S32 mv_cost, sad, tot_cost, slot;

                mv_cost = ps_result_prms->pf_mv_cost_compute(
                    &s_cand, &ps_results->as_pred_ctxt[pred_lx], (PART_ID_T)part_id, MV_RES_FPEL);
                sad = pi4_sad[num_active_pts * part_id];
                tot_cost = sad + mv_cost;

                /* Track the cheapest grid point for the first valid partition */
                if((unique_part_id == part_id) && (tot_cost < ps_result_prms->i4_min_cost))
                {
                    best_pt = pt;
                    ps_result_prms->i4_min_cost = tot_cost;
                }

                /* Nothing to do unless the candidate beats the worst result */
                if(tot_cost < ps_best[num_results - 1].i4_tot_cost)
                {
                    /* Find the slot of the candidate. A cheaper candidate   */
                    /* pushes the remaining results down by one; a candidate */
                    /* that hme_cmp_nodes reports as 0 against an equal cost */
                    /* entry takes that entry's slot without any shifting.   */
                    /* If neither happens the last slot is used.             */
                    slot = 0;
                    while(slot < num_results - 1)
                    {
                        if(tot_cost < ps_best[slot].i4_tot_cost)
                        {
                            memmove(
                                ps_best + slot + 1,
                                ps_best + slot,
                                sizeof(search_node_t) * (num_results - 1 - slot));
                            break;
                        }
                        if((tot_cost == ps_best[slot].i4_tot_cost) &&
                           (0 == hme_cmp_nodes(&s_cand, ps_best + slot)))
                        {
                            break;
                        }
                        slot++;
                    }

                    ps_best[slot] = s_cand;
                    ps_best[slot].i4_sad = sad;
                    ps_best[slot].i4_mv_cost = mv_cost;
                    ps_best[slot].i4_tot_cost = tot_cost;
                }
            }

            /* SADs of the next active point start one entry later */
            pi4_sad++;
        }
    }

    ps_result_prms->i4_min_id = best_pt;
}
1670 
1671 /**
1672 ********************************************************************************
1673 *  @fn     hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1674 *
1675 *  @brief  Updates results for the case where 1 best result is to be updated
1676 *          for a given pt, for several parts
1677 *          Note : The function is replicated for CLIPing the cost to 16bit to make
1678 *                  bit match with SIMD version
1679 *
1680 *  @param[in]  result_upd_prms_t : Contains the input parameters to this fxn
1681 *
1682 *  @return   The result_upd_prms_t structure is updated for all the active
1683 *            parts in case the current candt has results for any given part
1684 *             that is the best result for that part
1685 ********************************************************************************
1686 */
hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t * ps_result_prms)1687 void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1688 {
1689     search_node_t s_search_node_grid;
1690     const search_node_t *ps_search_node_base;
1691     search_node_t *ps_search_node_grid, *ps_best_node;
1692     S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1693     S32 num_results, i4_unique_id = -1, i4_grid_pt;
1694     search_results_t *ps_search_results;
1695     S32 *pi4_valid_part_ids;
1696     S32 i4_step = ps_result_prms->i4_step;
1697     S32 i4_grid_mask, i4_count, i, i4_min_id;
1698     S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1699     S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1700     S32 grid_count = 0;
1701     S32 pred_lx;
1702 
1703     i4_min_id = (S32)PT_C;
1704     i4_min_cost = MAX_32BIT_VAL;
1705     ps_search_node_grid = &s_search_node_grid;
1706     ps_search_node_base = ps_result_prms->ps_search_node_base;
1707     *ps_search_node_grid = *ps_search_node_base;
1708     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1709     ps_search_results = ps_result_prms->ps_search_results;
1710     num_results = (S32)ps_search_results->u1_num_results_per_part;
1711     i4_grid_mask = ps_result_prms->i4_grid_mask;
1712 
1713     for(i = 0; i < 9; i++)
1714     {
1715         if(i4_grid_mask & (1 << i))
1716             grid_count++;
1717     }
1718 
1719     /* Some basic assumptions: only single pt, only part updates */
1720     /* and more than 1 best result to be computed.               */
1721 
1722     i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1723     pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1724 
1725     /*************************************************************************/
1726     /* Supposing we do hte result update for a unique partid, we can */
1727     /* store the best pt id in the grid and also min cost is return */
1728     /* param. This will be useful for early exit cases.             */
1729     /* TODO : once we have separate fxn for unique part+grid, we can */
1730     /* do away with this code here                                   */
1731     /*************************************************************************/
1732     //if (pi4_valid_part_ids[1] == -1)
1733     i4_unique_id = pi4_valid_part_ids[0];
1734 
1735     /*************************************************************************/
1736     /* Outer loop runs through all active pts in the grid                    */
1737     /*************************************************************************/
1738     for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1739     {
1740         if(!(i4_grid_mask & (1 << i4_grid_pt)))
1741             continue;
1742 
1743         /* For the pt in the grid, update mvx and y depending on */
1744         /* location of pt. Updates are in FPEL units.            */
1745         ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1746         ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1747         ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1748         ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1749 
1750         i4_count = 0;
1751 
1752         /* pi4_valid_part_ids contains all the valid ids. We loop through */
1753         /* this till we encounter -1. This is easier than having to       */
1754         /* figure out part by part, besides, active part decision is      */
1755         /* usually fixed for a given duration of search, e.g. entire fpel */
1756         /* refinement for a blk/cu will use fixed valid part mask         */
1757 
1758         while((id = pi4_valid_part_ids[i4_count]) >= 0)
1759         {
1760             //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
1761 
1762             /*****************************************************************/
1763             /* points to the best search results corresponding to this       */
1764             /* specific part type.                                           */
1765             /*****************************************************************/
1766             ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1767 
1768             /* evaluate mv cost and totalcost for this part for this given mv*/
1769             i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1770                 ps_search_node_grid,
1771                 &ps_search_results->as_pred_ctxt[pred_lx],
1772                 (PART_ID_T)id,
1773                 MV_RES_FPEL);
1774 
1775             i4_sad = pi4_sad_grid[grid_count * id];
1776 
1777             /* Clipping to 16 bit to bit match with SIMD version */
1778             i4_mv_cost = CLIP_S16(i4_mv_cost);
1779             i4_sad = CLIP_S16(i4_sad);
1780 
1781             i4_tot_cost = i4_sad + i4_mv_cost;
1782             /* Clipping to 16 bit to bit match with SIMD version */
1783             i4_tot_cost = CLIP_S16(i4_tot_cost);
1784 
1785             if(i4_unique_id == id)
1786             {
1787                 if(i4_tot_cost < ps_result_prms->i4_min_cost)
1788                 {
1789                     i4_min_id = i4_grid_pt;
1790                     ps_result_prms->i4_min_cost = i4_tot_cost;
1791                 }
1792             }
1793 
1794             /*****************************************************************/
1795             /* We do not labor through the results if the total cost worse   */
1796             /* than the last of the results.                                 */
1797             /*****************************************************************/
1798             if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1799             {
1800                 S32 eq_cost = 0;
1801                 /*************************************************************/
1802                 /* Identify where the current result isto be placed.Basically*/
1803                 /* find the node which has cost just higher thannodeundertest*/
1804                 /*************************************************************/
1805                 for(i = 0; i < num_results - 1; i++)
1806                 {
1807                     if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1808                     {
1809                         memmove(
1810                             ps_best_node + i + 1,
1811                             ps_best_node + i,
1812                             sizeof(search_node_t) * (num_results - 1 - i));
1813                         break;
1814                     }
1815                     else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1816                     {
1817                         //if (0 == hme_cmp_nodes(ps_search_node_grid, ps_best_node+i))
1818                         //  break;
1819                         /* When cost is same we comp. the nodes and if it's same skip. */
1820                         /* We don't want to add this code to intrinsic. So we are      */
1821                         /* commenting it. The quality impact was minor when we did the */
1822                         /* regression.                                                 */
1823                         eq_cost = 1;
1824                     }
1825                 }
1826                 if(!eq_cost)
1827                 {
1828                     ps_best_node[i] = *ps_search_node_grid;
1829                     ps_best_node[i].i4_sad = i4_sad;
1830                     ps_best_node[i].i4_mv_cost = i4_mv_cost;
1831                     ps_best_node[i].i4_tot_cost = i4_tot_cost;
1832                 }
1833             }
1834             i4_count++;
1835         }
1836         pi4_sad_grid++;
1837     }
1838     ps_result_prms->i4_min_id = i4_min_id;
1839 }
1840 
1841 /**
1842 ********************************************************************************
1843 *  @fn     hme_update_results_pt_npu_best1(result_upd_prms_t *ps_result_prms)
1844 *
1845 *  @brief  Updates results for the case where 1 best result is to be updated
1846 *          for a given pt, for several parts
1847 *
1848 *  @param[in]  ps_result_prms. Contains the input parameters to this fxn
1849 *              ::ps_pred_info : contains cost fxn ptr and predictor info
1850 *              ::pi4_sad : 17x9 SAD Grid, this case, only 1st 17 entries valid
1851 *              ::ps_search_results: Search results structure
1852 *              ::i1_ref_id : Reference index
1853 *              ::i4_grid_mask: Dont Care for this fxn
1854 *              ::pi4_valid_part_ids : valid part ids
1855 *              ::ps_search_node_base: Contains the centre pt candt info.
1856 *
1857 *  @return   The ps_search_results structure is updated for all the active
1858 *            parts in case the current candt has results for any given part
1859 *             that is the best result for that part
1860 ********************************************************************************
1861 */
1862 
hme_update_results_pt_pu_best1_subpel_hs(err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms)1863 void hme_update_results_pt_pu_best1_subpel_hs(
1864     err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1865 {
1866     search_node_t *ps_search_node_base, *ps_best_node;
1867     search_results_t *ps_search_results;
1868     S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1869     S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1870     S32 num_results, i;
1871     S32 *pi4_valid_part_ids;
1872 
1873     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1874     /* Some basic assumptions: only single pt, only part updates */
1875     /* and more than 1 best result to be computed.               */
1876     ASSERT(ps_result_prms->i4_grid_mask == 1);
1877 
1878     ps_search_results = ps_result_prms->ps_search_results;
1879     num_results = (S32)ps_search_results->u1_num_results_per_part;
1880 
1881     /* Compute mv cost, total cost */
1882     ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1883 
1884     while((id = pi4_valid_part_ids[i4_count]) >= 0)
1885     {
1886         S32 update_required = 1;
1887 
1888         ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1889         /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1890         i4_mv_cost = ps_best_node->i4_mv_cost;
1891         i4_sad = ps_result_prms->pi4_sad_grid[id];
1892         i4_tot_cost = i4_sad + i4_mv_cost;
1893 
1894         /* We do not labor through the results if the total cost is worse than   */
1895         /* the last of the results.                                              */
1896         if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1897         {
1898             /* Identify where the current result is to be placed. Basically find  */
1899             /* the node which has cost just higher than node under test           */
1900             for(i = 0; i < num_results - 1; i++)
1901             {
1902                 if(ps_best_node[i].i1_ref_idx != -1)
1903                 {
1904                     if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1905                     {
1906                         memmove(
1907                             ps_best_node + i + 1,
1908                             ps_best_node + i,
1909                             sizeof(search_node_t) * (num_results - 1 - i));
1910                         break;
1911                     }
1912                     else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1913                     {
1914                         update_required = 0;
1915                         break;
1916                     }
1917                 }
1918                 else
1919                 {
1920                     break;
1921                 }
1922             }
1923 
1924             if(update_required)
1925             {
1926                 /* Update when either ref_idx or mv's are different */
1927                 ps_best_node[i] = *ps_search_node_base;
1928                 ps_best_node[i].i4_sad = i4_sad;
1929                 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1930                 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1931             }
1932         }
1933         i4_count++;
1934     }
1935 }
1936 
/**
********************************************************************************
*  @fn     hme_update_results_pt_pu_best1_subpel_hs_1(err_prms_t *ps_err_prms,
*                                                     result_upd_prms_t *ps_result_prms)
*
*  @brief  Single point update that keeps two results per valid partition in
*          the subpel refine context. The candidate cost (SAD + the mv cost of
*          the current first result in ps_search_results) is compared with
*          results 0 and 1 of ps_search_results; accepted candidates are
*          written to entries [0] / [1] of the subpel refine context arrays.
*
*  @param[in]  ps_err_prms : not referenced by this function
*
*  @param[in]  ps_result_prms. Contains the input parameters to this fxn
*              ::pi4_sad_grid : SADs of the candidate, indexed by part id
*              ::ps_search_results: results the candidate is compared against
*              ::u1_pred_lx : selects the result list used for comparison
*              ::i4_grid_mask: asserted to be 1 (single point)
*              ::pi4_valid_part_ids : part ids, terminated by a negative value
*              ::i2_mv_x, i2_mv_y, i1_ref_idx : candidate mv and ref index
*              ::ps_subpel_refine_ctxt : receives the update; its arrays are
*                indexed by the position within pi4_valid_part_ids
*
*  @return   None
*
*  NOTE(review): the comparison reads ps_search_results while the update goes
*  to ps_subpel_refine_ctxt; how the two are kept in step is not visible
*  here - confirm against the callers.
********************************************************************************
*/
void hme_update_results_pt_pu_best1_subpel_hs_1(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results;
    S32 *pi4_part_ids = ps_result_prms->pi4_valid_part_ids;
    S32 list_idx = ps_result_prms->u1_pred_lx;
    S32 part_pos, part_id;

    /* This variant handles a single point only */
    ASSERT(ps_result_prms->i4_grid_mask == 1);

    ps_results = ps_result_prms->ps_search_results;

    /* Walk the list of valid partitions up to its negative terminator */
    for(part_pos = 0; (part_id = pi4_part_ids[part_pos]) >= 0; part_pos++)
    {
        search_node_t *ps_best = ps_results->aps_part_results[list_idx][part_id];
        subpel_refine_ctxt_t *ps_ctxt;
        S32 mv_cost, sad, tot_cost;

        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
        mv_cost = ps_best->i4_mv_cost;
        sad = ps_result_prms->pi4_sad_grid[part_id];
        tot_cost = sad + mv_cost;

        /* Candidates no better than the second result are dropped */
        if(tot_cost >= ps_best[1].i4_tot_cost)
        {
            continue;
        }

        ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;

        if(tot_cost < ps_best[0].i4_tot_cost)
        {
            /* New best: the old entry 0 becomes entry 1 ... */
            ps_ctxt->i2_tot_cost[1][part_pos] = ps_ctxt->i2_tot_cost[0][part_pos];
            ps_ctxt->i2_mv_cost[1][part_pos] = ps_ctxt->i2_mv_cost[0][part_pos];
            ps_ctxt->i2_mv_x[1][part_pos] = ps_ctxt->i2_mv_x[0][part_pos];
            ps_ctxt->i2_mv_y[1][part_pos] = ps_ctxt->i2_mv_y[0][part_pos];
            ps_ctxt->i2_ref_idx[1][part_pos] = ps_ctxt->i2_ref_idx[0][part_pos];

            /* ... and the candidate takes entry 0 */
            ps_ctxt->i2_tot_cost[0][part_pos] = tot_cost;
            ps_ctxt->i2_mv_cost[0][part_pos] = mv_cost;
            ps_ctxt->i2_mv_x[0][part_pos] = ps_result_prms->i2_mv_x;
            ps_ctxt->i2_mv_y[0][part_pos] = ps_result_prms->i2_mv_y;
            ps_ctxt->i2_ref_idx[0][part_pos] = ps_result_prms->i1_ref_idx;
        }
        else if(
            (ps_result_prms->i2_mv_x != ps_best[0].s_mv.i2_mvx) ||
            (ps_result_prms->i2_mv_y != ps_best[0].s_mv.i2_mvy) ||
            (ps_best[0].i1_ref_idx != ps_result_prms->i1_ref_idx))
        {
            /* Second best, and not a repeat of the best mv / ref idx: the */
            /* candidate replaces entry 1.                                 */
            ps_ctxt->i2_tot_cost[1][part_pos] = tot_cost;
            ps_ctxt->i2_mv_cost[1][part_pos] = mv_cost;
            ps_ctxt->i2_mv_x[1][part_pos] = ps_result_prms->i2_mv_x;
            ps_ctxt->i2_mv_y[1][part_pos] = ps_result_prms->i2_mv_y;
            ps_ctxt->i2_ref_idx[1][part_pos] = ps_result_prms->i1_ref_idx;
        }
    }
}
2026 
2027 /**
2028 ******************************************************************************
2029 *  @brief Gives a result fxn ptr for a index [x] where x is as:
2030 *         0 : single pt, no partial updates, 1 best result
2031 *         1 : single pt, no partial updates, N best results
2032 *         2 : single pt,    partial updates, 1 best result
2033 *         3 : single pt,    partial updates, N best results
2034 *         0 : grid     , no partial updates, 1 best result
2035 *         1 : grid     , no partial updates, N best results
2036 *         2 : grid     ,    partial updates, 1 best result
2037 *         3 : grid     ,    partial updates, N best results
2038 ******************************************************************************
2039 */
2040 
2041 static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1,   UPD_RES_PT_NPU_BESTN,
2042                                               UPD_RES_PT_PU_BEST1,    UPD_RES_PT_PU_BESTN,
2043                                               UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
2044                                               UPD_RES_GRID_PU_BEST1,  UPD_RES_GRID_PU_BESTN };
2045 
2046 /**
2047 ********************************************************************************
2048 *  @fn     hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
2049 *
2050 *  @brief  Obtains the suitable result function that evaluates COST and also
2051 *           computes one or more best results for point/grid, single part or
2052 *           more than one part.
2053 *
2054 *  @param[in]  i4_grid_mask : Mask containing which of 9 grid pts active
2055 *
2056 *  @param[in]  i4_part_mask : Mask containing which of the 17 parts active
2057 *
2058 *  @param[in]  i4_num_results: Number of active results
2059 *
2060 *  @return   Pointer to the appropriate result update function
2061 ********************************************************************************
2062 */
hme_get_result_fxn(S32 i4_grid_mask,S32 i4_part_mask,S32 i4_num_results)2063 PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
2064 {
2065     S32 i4_is_grid = (i4_grid_mask != 1);
2066     S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
2067     S32 i4_res_gt1 = (i4_num_results > 1);
2068     S32 id;
2069 
2070     id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
2071 
2072     return (g_pf_result_fxn[id]);
2073 }
2074 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_2_best_results
*
*  @brief  For every candidate search node: computes the SAD of each 4x4
*          sub-block (laid out four per row) between the input selected by
*          the node's ref idx and the reference displaced by the node's MV,
*          combines the 4x4 SADs into the SADs of all 17 partitions, derives
*          an MV cost against the 2Nx2N MV predictor, and updates the best
*          and second best result of every valid partition in the fullpel
*          refine context.
*
*  @param[in]     ps_search_prms : Supplies the search nodes and their count,
*                 the CU and search x/y offsets and the fullpel refine context
*                 (ps_fullpel_refine_ctxt) that receives the results
*
*  @param[in]     ps_wt_inp_prms : Supplies the per ref idx input pointers
*                 (apu1_wt_inp)
*
*  @param[in,out] ps_err_prms : Supplies input/ref strides and the SAD grid;
*                 pu1_inp and pu1_ref are overwritten for every node and
*                 pi4_sad_grid holds the partition SADs of the last
*                 evaluated node on return
*
*  @param[in]     ps_result_prms : i1_ref_idx selects the prediction context
*                 inside ps_search_results used for MV cost computation
*
*  @param[in]     ppu1_ref : Reference pointers, indexed by the node's ref idx
*
*  @param[in]     i4_ref_stride : Stride used to position the reference
*                 pointer from the offsets and the MV. Note that the pixel
*                 loops step through the reference with
*                 ps_err_prms->i4_ref_stride
*
*  @return   None
********************************************************************************
*/
void hme_calc_sad_and_2_best_results(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
{
    S32 i4_candt;
    S32 i4_inp_off;
    S32 i4_ref_offset;
    S32 i4_num_nodes;

    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
    /* Stride multiplied by 4: distance between vertically adjacent 4x4 blocks */
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);

    mv_refine_ctxt_t *ps_mv_refine_ctxt;
    search_node_t *ps_search_node;

    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
    i4_inp_off = ps_search_prms->i4_cu_x_off;
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    ps_search_node = ps_search_prms->ps_search_nodes;

    /* The node pointer is advanced in the loop header so that the `continue` */
    /* taken for INTRA_MV nodes moves on to the next node. Advancing it only  */
    /* at the end of the body would leave the pointer stuck on the intra node */
    /* and silently drop every remaining candidate.                           */
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++, ps_search_node++)
    {
        /**********************************************************************/
        /* COMPUTE THE 4x4 SADs AND UPDATE THE PARTITION SAD GRID             */
        /**********************************************************************/
        {
            WORD32 b, c, d;
            UWORD8 *pu1_cur_ptr;
            UWORD8 *pu1_ref_ptr;
            UWORD16 au2_4x4_sad[NUM_4X4];

            /* Nodes flagged as intra carry no usable MV: skip them */
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
            {
                continue;
            }

            /* Input comes from the buffer associated with the node's ref idx; */
            /* reference is displaced by the node's MV                         */
            ps_err_prms->pu1_inp =
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);

            pu1_cur_ptr = ps_err_prms->pu1_inp;
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];

            /* Loop to compute the SAD's */
            {
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
                for(b = 0; b < NUM_4X4; b++)
                {
                    /* (b % 4) is the block column, (b >> 2) the block row */
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;

                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
                    {
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
                        {
                            au2_4x4_sad[b] += (UWORD16)ABS((
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
                        }
                    }
                }

                /* Quadrant SADs: each is the sum of a 2x2 group of 4x4 blocks */
                pi4_sad_grid[PART_ID_NxN_TL] =
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
                pi4_sad_grid[PART_ID_NxN_TR] =
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
                pi4_sad_grid[PART_ID_NxN_BL] =
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
                pi4_sad_grid[PART_ID_NxN_BR] =
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);

                /* Halves are sums of two quadrants */
                pi4_sad_grid[PART_ID_Nx2N_L] =
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
                pi4_sad_grid[PART_ID_Nx2N_R] =
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
                pi4_sad_grid[PART_ID_2NxN_T] =
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
                pi4_sad_grid[PART_ID_2NxN_B] =
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];

                /* Narrow strips: one column or one row of 4x4 blocks */
                pi4_sad_grid[PART_ID_nLx2N_L] =
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
                pi4_sad_grid[PART_ID_nRx2N_R] =
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
                pi4_sad_grid[PART_ID_2NxnU_T] =
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
                pi4_sad_grid[PART_ID_2NxnD_B] =
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);

                /* Whole block, and wide strips as whole minus narrow strip */
                pi4_sad_grid[PART_ID_2Nx2N] =
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
                pi4_sad_grid[PART_ID_2NxnU_B] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
                pi4_sad_grid[PART_ID_2NxnD_T] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
                pi4_sad_grid[PART_ID_nRx2N_L] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
                pi4_sad_grid[PART_ID_nLx2N_R] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
            }
        }

        /**********************************************************************/
        /* COMPUTE THE MV COST AND UPDATE THE BEST / SECOND BEST RESULTS      */
        /**********************************************************************/
        {
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
            S32 best_node_cost;
            S32 second_best_node_cost;

            /* MV cost: one value per node, computed against the 2Nx2N MV    */
            /* predictor and shared by all partitions of this node           */
            {
                S16 mvdx1, mvdy1;
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
                S32 pred_lx = i4_search_idx;

                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;

                S32 inp_shift = 2;
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
                S32 lambda = ps_pred_ctxt->lambda;
                S32 rnd = 1 << (lambda_q_shift - 1);
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
                S32 ref_bits =
                    ps_pred_ctxt
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];

                COMPUTE_DIFF_MV(
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);

                mvdx1 = ABS(mvdx1);
                mvdy1 = ABS(mvdy1);

                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
                             (mvdy1 > 0) + ref_bits + 2;

                /* Scale by lambda with rounding */
                i4_mv_cost *= lambda;
                i4_mv_cost += rnd;
                i4_mv_cost >>= lambda_q_shift;

                i4_mv_cost = CLIP_U16(i4_mv_cost);
            }

            /* For each valid partition, update the refine context to reflect */
            /* the best and second best candidates for that partition         */
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
            {
                S32 update_required = 0;
                S32 part_id = pi4_valid_part_ids[i4_count];
                /* Results are stored by part id when more than 8 parts are   */
                /* valid, otherwise compactly by position in the valid list   */
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;

                /*Calculate total cost*/
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

                /*****************************************************************/
                /* We do not labor through the results if the total cost is      */
                /* worse than the last of the results.                           */
                /*****************************************************************/
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
                second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);

                if(i4_tot_cost < second_best_node_cost)
                {
                    /* 2: replace second best, 1: becomes new best, 0: no update */
                    update_required = 2;

                    /*************************************************************/
                    /* Identify where the current result is to be placed, i.e.   */
                    /* find the node whose cost is just higher than this one. A  */
                    /* cost equal to the best is treated as a duplicate.         */
                    /*************************************************************/
                    if(i4_tot_cost < best_node_cost)
                    {
                        update_required = 1;
                    }
                    else if(i4_tot_cost == best_node_cost)
                    {
                        update_required = 0;
                    }

                    if(update_required == 2)
                    {
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
                    }
                    else if(update_required == 1)
                    {
                        /* Demote the current best to second best ... */
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];

                        /* ... and install this node as the new best */
                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
                    }
                }
            }
        }
    }

    /* Result slots still at the maximum cost were never updated above: give  */
    /* them the ref idx of the first search node. The slot is addressed with  */
    /* the same index rule as the update loop, so that the slots inspected    */
    /* here are the ones that were actually written when 8 or fewer parts are */
    /* valid.                                                                 */
    {
        WORD32 i4_i;
        search_node_t *ps_first_node = ps_search_prms->ps_search_nodes;

        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
        {
            WORD32 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
            WORD32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_i;

            if(ps_mv_refine_ctxt->i2_tot_cost[0][index] >= MAX_SIGNED_16BIT_VAL)
            {
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][index] == MAX_SIGNED_16BIT_VAL);
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][index] == 0);
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][index] == 0);

                ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_first_node->i1_ref_idx;
            }
            if(ps_mv_refine_ctxt->i2_tot_cost[1][index] >= MAX_SIGNED_16BIT_VAL)
            {
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][index] == MAX_SIGNED_16BIT_VAL);
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][index] == 0);
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][index] == 0);

                ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_first_node->i1_ref_idx;
            }
        }
    }
}
2323 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_2_best_results_subpel
*
*  @brief  Evaluates a single candidate: computes the SAD of each 4x4
*          sub-block (laid out four per row) between ps_err_prms->pu1_inp and
*          ps_err_prms->pu1_ref, combines them into the SADs of all 17
*          partitions, and updates the best and second best result of every
*          valid partition in the subpel refine context. The MV cost is not
*          recomputed; the value already stored for the best result is used.
*
*  @param[in,out] ps_err_prms : Supplies input/ref pointers and strides;
*                 pi4_sad_grid is filled with the partition SADs
*
*  @param[in,out] ps_result_prms : Supplies the candidate's mv (i2_mv_x,
*                 i2_mv_y), its i1_ref_idx and the subpel refine context
*                 (ps_subpel_refine_ctxt) that receives the results
*
*  @return   None
********************************************************************************
*/
void hme_calc_sad_and_2_best_results_subpel(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;

    S32 *pi4_grid = ps_err_prms->pi4_sad_grid;
    S32 i4_inp_strd = ps_err_prms->i4_inp_stride;
    WORD32 i4_ref_strd = ps_err_prms->i4_ref_stride;

    UWORD8 *pu1_inp = ps_err_prms->pu1_inp;
    UWORD8 *pu1_ref = ps_err_prms->pu1_ref;

    UWORD16 au2_blk_sad[NUM_4X4];
    WORD32 i4_blk;
    S32 i4_part;
    WORD32 i4_rank;

    /**************************************************************************/
    /* Step 1: SAD of every 4x4 block                                         */
    /**************************************************************************/
    for(i4_blk = 0; i4_blk < NUM_4X4; i4_blk++)
    {
        /* Block column gives the horizontal offset, block row the vertical */
        /* offset (4 lines of the respective stride per block row)          */
        WORD32 i4_col_off = (i4_blk % 4) * NUM_PIXELS_IN_ROW;
        WORD32 i4_inp_base = i4_col_off + (i4_blk >> 2) * (i4_inp_strd << 2);
        WORD32 i4_ref_base = i4_col_off + (i4_blk >> 2) * (i4_ref_strd << 2);
        WORD32 i4_row, i4_col;
        UWORD16 u2_acc = 0;

        for(i4_row = 0; i4_row < NUM_ROWS_IN_4X4; i4_row++)
        {
            UWORD8 *pu1_inp_row = &pu1_inp[i4_inp_strd * i4_row + i4_inp_base];
            UWORD8 *pu1_ref_row = &pu1_ref[i4_ref_strd * i4_row + i4_ref_base];

            for(i4_col = 0; i4_col < NUM_PIXELS_IN_ROW; i4_col++)
            {
                u2_acc += (UWORD16)ABS(
                    (((S32)pu1_ref_row[i4_col]) - ((S32)pu1_inp_row[i4_col])));
            }
        }
        au2_blk_sad[i4_blk] = u2_acc;
    }

    /**************************************************************************/
    /* Step 2: Partition SADs from the 4x4 SADs                               */
    /**************************************************************************/
    {
        const UWORD16 *pu2_s = &au2_blk_sad[0];

        /* Quadrants: 2x2 groups of 4x4 blocks */
        S32 i4_tl = pu2_s[0] + pu2_s[1] + pu2_s[4] + pu2_s[5];
        S32 i4_tr = pu2_s[2] + pu2_s[3] + pu2_s[6] + pu2_s[7];
        S32 i4_bl = pu2_s[8] + pu2_s[9] + pu2_s[12] + pu2_s[13];
        S32 i4_br = pu2_s[10] + pu2_s[11] + pu2_s[14] + pu2_s[15];

        /* Narrow strips: outermost column / row of 4x4 blocks */
        S32 i4_left_col = pu2_s[0] + pu2_s[4] + pu2_s[8] + pu2_s[12];
        S32 i4_right_col = pu2_s[3] + pu2_s[7] + pu2_s[11] + pu2_s[15];
        S32 i4_top_row = pu2_s[0] + pu2_s[1] + pu2_s[2] + pu2_s[3];
        S32 i4_bot_row = pu2_s[12] + pu2_s[13] + pu2_s[14] + pu2_s[15];

        S32 i4_top_half = i4_tr + i4_tl;
        S32 i4_bot_half = i4_br + i4_bl;
        S32 i4_whole = i4_top_half + i4_bot_half;

        pi4_grid[PART_ID_NxN_TL] = i4_tl;
        pi4_grid[PART_ID_NxN_TR] = i4_tr;
        pi4_grid[PART_ID_NxN_BL] = i4_bl;
        pi4_grid[PART_ID_NxN_BR] = i4_br;

        pi4_grid[PART_ID_Nx2N_L] = i4_tl + i4_bl;
        pi4_grid[PART_ID_Nx2N_R] = i4_tr + i4_br;
        pi4_grid[PART_ID_2NxN_T] = i4_top_half;
        pi4_grid[PART_ID_2NxN_B] = i4_bot_half;

        pi4_grid[PART_ID_nLx2N_L] = i4_left_col;
        pi4_grid[PART_ID_nRx2N_R] = i4_right_col;
        pi4_grid[PART_ID_2NxnU_T] = i4_top_row;
        pi4_grid[PART_ID_2NxnD_B] = i4_bot_row;

        pi4_grid[PART_ID_2Nx2N] = i4_whole;

        /* Wide strips are the whole block minus the matching narrow strip */
        pi4_grid[PART_ID_2NxnU_B] = i4_whole - i4_top_row;
        pi4_grid[PART_ID_2NxnD_T] = i4_whole - i4_bot_row;
        pi4_grid[PART_ID_nRx2N_L] = i4_whole - i4_right_col;
        pi4_grid[PART_ID_nLx2N_R] = i4_whole - i4_left_col;
    }

    /**************************************************************************/
    /* Step 3: Rank this candidate against the stored best two per partition  */
    /**************************************************************************/
    for(i4_part = 0; i4_part < ps_ctxt->i4_num_valid_parts; i4_part++)
    {
        S32 i4_part_id = ps_ctxt->ai4_part_id[i4_part];
        /* Slot is the part id when more than 8 parts are valid, else the */
        /* position within the valid part list                            */
        S32 i4_slot = (ps_ctxt->i4_num_valid_parts > 8) ? i4_part_id : i4_part;

        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
        S32 i4_mv_cost = ps_ctxt->i2_mv_cost[0][i4_slot];

        S32 i4_sad = CLIP3(pi4_grid[i4_part_id], 0, 0x7fff);
        S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

        S32 i4_best_cost = CLIP_S16(ps_ctxt->i2_tot_cost[0][i4_slot]);
        S32 i4_second_cost = CLIP_S16(ps_ctxt->i2_tot_cost[1][i4_slot]);

        /* Not better than the second best: nothing to do for this part */
        if(i4_tot_cost >= i4_second_cost)
        {
            continue;
        }

        if(i4_tot_cost < i4_best_cost)
        {
            /* New best: the old best slides down to second best */
            ps_ctxt->i2_tot_cost[1][i4_slot] = ps_ctxt->i2_tot_cost[0][i4_slot];
            ps_ctxt->i2_mv_cost[1][i4_slot] = ps_ctxt->i2_mv_cost[0][i4_slot];
            ps_ctxt->i2_mv_x[1][i4_slot] = ps_ctxt->i2_mv_x[0][i4_slot];
            ps_ctxt->i2_mv_y[1][i4_slot] = ps_ctxt->i2_mv_y[0][i4_slot];
            ps_ctxt->i2_ref_idx[1][i4_slot] = ps_ctxt->i2_ref_idx[0][i4_slot];

            ps_ctxt->i2_tot_cost[0][i4_slot] = i4_tot_cost;
            ps_ctxt->i2_mv_cost[0][i4_slot] = i4_mv_cost;
            ps_ctxt->i2_mv_x[0][i4_slot] = ps_result_prms->i2_mv_x;
            ps_ctxt->i2_mv_y[0][i4_slot] = ps_result_prms->i2_mv_y;
            ps_ctxt->i2_ref_idx[0][i4_slot] = ps_result_prms->i1_ref_idx;
        }
        else if(i4_tot_cost != ps_ctxt->i2_tot_cost[0][i4_slot])
        {
            /* Strictly between best and second best: new second best. */
            /* A cost equal to the best is left out as a duplicate.    */
            ps_ctxt->i2_tot_cost[1][i4_slot] = i4_tot_cost;
            ps_ctxt->i2_mv_cost[1][i4_slot] = i4_mv_cost;
            ps_ctxt->i2_mv_x[1][i4_slot] = ps_result_prms->i2_mv_x;
            ps_ctxt->i2_mv_y[1][i4_slot] = ps_result_prms->i2_mv_y;
            ps_ctxt->i2_ref_idx[1][i4_slot] = ps_result_prms->i1_ref_idx;
        }
    }

    /**************************************************************************/
    /* Step 4: Every slot whose total cost is at the maximum gets its fullpel */
    /* satd forced to the maximum as well                                     */
    /**************************************************************************/
    for(i4_rank = 0; i4_rank < 2; i4_rank++)
    {
        WORD32 i4_slot;

        for(i4_slot = 0; i4_slot < TOT_NUM_PARTS; i4_slot++)
        {
            if(ps_ctxt->i2_tot_cost[i4_rank][i4_slot] >= MAX_SIGNED_16BIT_VAL)
            {
                ps_ctxt->ai2_fullpel_satd[i4_rank][i4_slot] = MAX_SIGNED_16BIT_VAL;
            }
        }
    }
}
2506 
hme_calc_stim_injected_sad_and_2_best_results(hme_search_prms_t * ps_search_prms,wgt_pred_ctxt_t * ps_wt_inp_prms,err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms,U08 ** ppu1_ref,S32 i4_ref_stride)2507 void hme_calc_stim_injected_sad_and_2_best_results(
2508     hme_search_prms_t *ps_search_prms,
2509     wgt_pred_ctxt_t *ps_wt_inp_prms,
2510     err_prms_t *ps_err_prms,
2511     result_upd_prms_t *ps_result_prms,
2512     U08 **ppu1_ref,
2513     S32 i4_ref_stride)
2514 {
2515     mv_refine_ctxt_t *ps_mv_refine_ctxt;
2516     search_node_t *ps_search_node;
2517 
2518     S32 i4_candt;
2519     S32 i4_count;
2520     S32 i4_inp_off;
2521     S32 i4_ref_offset;
2522     S32 i4_num_nodes;
2523     ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
2524         au8_final_ref_sigmaXSquared[17];
2525     UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
2526     S32 *pi4_valid_part_ids;
2527 
2528     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2529     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2530     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2531     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2532     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2533 
2534     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2535     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2536     i4_inp_off = ps_search_prms->i4_cu_x_off;
2537     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2538     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2539     ps_search_node = ps_search_prms->ps_search_nodes;
2540     pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2541 
2542     /* Set local pointer to point to partition level sigma values calculated in hme_refine */
2543     au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
2544     au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
2545 
2546     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2547     {
2548         {
2549             WORD32 b, c, d;
2550             UWORD8 *pu1_cur_ptr;
2551             UWORD8 *pu1_ref_ptr;
2552             UWORD16 au2_4x4_sad[NUM_4X4];
2553 
2554             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2555             {
2556                 continue;
2557             }
2558 
2559             ps_err_prms->pu1_inp =
2560                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2561             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2562             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2563             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2564 
2565             pu1_cur_ptr = ps_err_prms->pu1_inp;
2566             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2567 
2568             /* Loop to compute the SAD's */
2569             {
2570                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2571                 for(b = 0; b < NUM_4X4; b++)
2572                 {
2573                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2574                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2575 
2576                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2577                     {
2578                         WORD32 z_cur = (cur_buf_stride)*c + t1;
2579                         WORD32 z_ref = (ref_buf_stride)*c + t2;
2580                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2581                         {
2582                             au2_4x4_sad[b] += (UWORD16)ABS((
2583                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2584                         }
2585                     }
2586                 }
2587 
2588                 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
2589                 hme_compute_sigmaX_and_sigmaXSquared(
2590                     pu1_ref_ptr,
2591                     ref_buf_stride,
2592                     au4_4x4_ref_sigmaX,
2593                     au4_4x4_ref_sigmaXSquared,
2594                     4,
2595                     4,
2596                     16,
2597                     16,
2598                     1,
2599                     4);
2600 
2601                 pi4_sad_grid[PART_ID_NxN_TL] =
2602                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2603                 pi4_sad_grid[PART_ID_NxN_TR] =
2604                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2605                 pi4_sad_grid[PART_ID_NxN_BL] =
2606                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2607                 pi4_sad_grid[PART_ID_NxN_BR] =
2608                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2609                 pi4_sad_grid[PART_ID_Nx2N_L] =
2610                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2611                 pi4_sad_grid[PART_ID_Nx2N_R] =
2612                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2613                 pi4_sad_grid[PART_ID_2NxN_T] =
2614                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2615                 pi4_sad_grid[PART_ID_2NxN_B] =
2616                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2617                 pi4_sad_grid[PART_ID_nLx2N_L] =
2618                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2619                 pi4_sad_grid[PART_ID_nRx2N_R] =
2620                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2621                 pi4_sad_grid[PART_ID_2NxnU_T] =
2622                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2623                 pi4_sad_grid[PART_ID_2NxnD_B] =
2624                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2625                 pi4_sad_grid[PART_ID_2Nx2N] =
2626                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2627                 pi4_sad_grid[PART_ID_2NxnU_B] =
2628                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2629                 pi4_sad_grid[PART_ID_2NxnD_T] =
2630                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2631                 pi4_sad_grid[PART_ID_nRx2N_L] =
2632                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2633                 pi4_sad_grid[PART_ID_nLx2N_R] =
2634                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2635             }
2636         }
2637 
2638         {
2639             S32 i4_sad, i4_mv_cost, i4_tot_cost;
2640             S32 best_node_cost;
2641             S32 second_best_node_cost;
2642             ULWORD64 u8_temp_var, u8_temp_var1;
2643             ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
2644 
2645             {
2646                 S16 mvdx1, mvdy1;
2647                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2648                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2649                 S32 pred_lx = i4_search_idx;
2650 
2651                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2652                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2653                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2654 
2655                 S32 inp_shift = 2;
2656                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2657                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2658                 S32 lambda = ps_pred_ctxt->lambda;
2659                 S32 rnd = 1 << (lambda_q_shift - 1);
2660                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2661                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2662                 S32 ref_bits =
2663                     ps_pred_ctxt
2664                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2665 
2666                 COMPUTE_DIFF_MV(
2667                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2668 
2669                 mvdx1 = ABS(mvdx1);
2670                 mvdy1 = ABS(mvdy1);
2671 
2672                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2673                              (mvdy1 > 0) + ref_bits + 2;
2674 
2675                 i4_mv_cost *= lambda;
2676                 i4_mv_cost += rnd;
2677                 i4_mv_cost >>= lambda_q_shift;
2678 
2679                 i4_mv_cost = CLIP_U16(i4_mv_cost);
2680             }
2681 
2682             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2683             {
2684                 S32 i4_stim_injected_sad;
2685                 S32 i4_stim_injected_cost;
2686                 S32 i4_noise_term;
2687                 unsigned long u4_shift_val;
2688                 S32 i4_bits_req;
2689 
2690                 S32 update_required = 0;
2691                 S32 part_id = pi4_valid_part_ids[i4_count];
2692                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2693 
2694                 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
2695 
2696                 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
2697 
2698                 if(ps_search_prms->i4_alpha_stim_multiplier)
2699                 {
2700                     /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
2701                     hme_compute_final_sigma_of_pu_from_base_blocks(
2702                         au4_4x4_ref_sigmaX,
2703                         au4_4x4_ref_sigmaXSquared,
2704                         au8_final_ref_sigmaX,
2705                         au8_final_ref_sigmaXSquared,
2706                         16,
2707                         4,
2708                         part_id,
2709                         4);
2710 
2711                     u8_ref_X_Square =
2712                         (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
2713                     u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
2714 
2715                     /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
2716                     /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
2717                     /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
2718                     u4_shift_val = ihevce_calc_stim_injected_variance(
2719                         au8_final_src_sigmaX,
2720                         au8_final_src_sigmaXSquared,
2721                         &u8_src_var,
2722                         i4_inv_wt,
2723                         ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
2724                         ps_wt_inp_prms->wpred_log_wdc,
2725                         part_id);
2726 
2727                     u8_ref_var = u8_ref_var >> u4_shift_val;
2728 
2729                     /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
2730                     GETRANGE64(i4_bits_req, u8_ref_var);
2731 
2732                     if(i4_bits_req > 27)
2733                     {
2734                         u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
2735                         u8_src_var = u8_src_var >> (i4_bits_req - 27);
2736                     }
2737 
2738                     if(u8_src_var == u8_ref_var)
2739                     {
2740                         u8_temp_var = (1 << STIM_Q_FORMAT);
2741                     }
2742                     else
2743                     {
2744                         u8_temp_var = (2 * u8_src_var * u8_ref_var);
2745                         u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
2746                         u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
2747                         u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
2748                         u8_temp_var = (u8_temp_var / u8_temp_var1);
2749                     }
2750 
2751                     i4_noise_term = (UWORD32)u8_temp_var;
2752 
2753                     ASSERT(i4_noise_term >= 0);
2754 
2755                     i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
2756                 }
2757                 else
2758                 {
2759                     i4_noise_term = 0;
2760                 }
2761                 u8_pure_dist = pi4_sad_grid[part_id];
2762                 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
2763                 u8_pure_dist += (1 << ((i4_q_level)-1));
2764                 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
2765 
2766                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2767                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2768                 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
2769                 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
2770 
2771                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
2772                 second_best_node_cost =
2773                     CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
2774 
2775                 if(i4_stim_injected_cost < second_best_node_cost)
2776                 {
2777                     update_required = 2;
2778 
2779                     if(i4_stim_injected_cost < best_node_cost)
2780                     {
2781                         update_required = 1;
2782                     }
2783                     else if(i4_stim_injected_cost == best_node_cost)
2784                     {
2785                         update_required = 0;
2786                     }
2787 
2788                     if(update_required == 2)
2789                     {
2790                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2791                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
2792                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2793                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2794                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2795                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2796                     }
2797                     else if(update_required == 1)
2798                     {
2799                         ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2800                             ps_mv_refine_ctxt->i2_tot_cost[0][index];
2801                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
2802                             ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
2803                         ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2804                             ps_mv_refine_ctxt->i2_mv_cost[0][index];
2805                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2806                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2807                         ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2808                             ps_mv_refine_ctxt->i2_ref_idx[0][index];
2809 
2810                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2811                         ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
2812                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2813                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2814                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2815                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2816                     }
2817                 }
2818             }
2819         }
2820 
2821         ps_search_node++;
2822     }
2823 
2824     {
2825         WORD32 i4_i;
2826         WORD32 part_id;
2827         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2828         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2829         {
2830             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2831             if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2832             {
2833                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2834                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2835                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2836 
2837                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2838             }
2839             if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2840             {
2841                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2842                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2843                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2844 
2845                 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2846             }
2847         }
2848     }
2849 }
2850 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_1_best_result
*
*  @brief  Evaluates every fullpel search candidate and retains, for each valid
*          partition, only the single lowest-cost result (slot 0).
*
*  @details For each candidate whose mvx is not INTRA_MV the function
*           - points ps_err_prms->pu1_inp / pu1_ref at the weighted input and
*             at the reference displaced by the candidate MV,
*           - accumulates NUM_4X4 4x4 SADs (blocks laid out four per row),
*           - combines them into the SAD of every partition id and writes them
*             to ps_err_prms->pi4_sad_grid,
*           - derives an MV cost: (range(|mvdx|) + range(|mvdy|) +
*             (mvdx != 0) + (mvdy != 0) + ref_bits + 2) scaled by lambda,
*             rounded, shifted down by lambda_q_shift and passed to CLIP_U16,
*           - for each valid partition, overwrites slot 0 of the fullpel
*             refine context when the clipped (SAD + MV cost) is strictly
*             lower than the cost currently stored there.
*           After all candidates, slots still holding a cost
*           >= MAX_SIGNED_16BIT_VAL get the ref idx of the first search node.
*
*  @param[in,out] ps_search_prms : candidate list (ps_search_nodes,
*                  i4_num_search_nodes), input offsets (i4_cu_x_off,
*                  i4_cu_y_off), reference offsets (i4_x_off, i4_y_off) and
*                  ps_fullpel_refine_ctxt, which receives the results
*
*  @param[in] ps_wt_inp_prms : apu1_wt_inp[] supplies the input pointer for
*                  the candidate's ref idx
*
*  @param[in,out] ps_err_prms : supplies i4_inp_stride / i4_ref_stride used
*                  while accumulating SADs; pu1_inp, pu1_ref and the contents
*                  of pi4_sad_grid are overwritten
*
*  @param[in] ps_result_prms : i1_ref_idx selects the prediction context in
*                  ps_search_results used for the MV cost
*
*  @param[in] ppu1_ref : reference pointers indexed by the candidate's ref idx
*
*  @param[in] i4_ref_stride : stride used to convert i4_y_off and the
*                  candidate's mvy into a reference offset (the SAD loops
*                  themselves step with ps_err_prms->i4_ref_stride)
*
*  @return None
********************************************************************************
*/
void hme_calc_sad_and_1_best_result(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
{
    S32 i4_candt;
    S32 i4_inp_off;
    S32 i4_ref_offset;
    S32 i4_num_nodes;

    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);

    mv_refine_ctxt_t *ps_mv_refine_ctxt;
    search_node_t *ps_search_node;

    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
    i4_inp_off = ps_search_prms->i4_cu_x_off;
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    ps_search_node = ps_search_prms->ps_search_nodes;

    /* The node pointer is advanced in the loop header so that the 'continue' */
    /* taken for intra candidates still moves on to the next candidate.       */
    /* Advancing it only at the bottom of the body left the pointer stuck on  */
    /* the first intra node, silently dropping every candidate after it.      */
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++, ps_search_node++)
    {
        /**********************************************************************/
        /* Compute the 4x4 SADs and populate the partition SAD grid           */
        /**********************************************************************/
        {
            WORD32 b, c, d;
            UWORD8 *pu1_cur_ptr;
            UWORD8 *pu1_ref_ptr;
            UWORD16 au2_4x4_sad[NUM_4X4];

            /* Intra candidates carry no motion vector to evaluate */
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
            {
                continue;
            }

            ps_err_prms->pu1_inp =
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);

            pu1_cur_ptr = ps_err_prms->pu1_inp;
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];

            memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));

            for(b = 0; b < NUM_4X4; b++)
            {
                /* Block b sits in column (b % 4) and row (b >> 2) of the grid */
                WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
                WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;

                for(c = 0; c < NUM_ROWS_IN_4X4; c++)
                {
                    WORD32 z_cur = (cur_buf_stride)*c + t1;
                    WORD32 z_ref = (ref_buf_stride)*c + t2;

                    for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
                    {
                        au2_4x4_sad[b] += (UWORD16)ABS(
                            (((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
                    }
                }
            }

            /* Quadrant (NxN) partitions: sum of the four 4x4 blocks in each */
            pi4_sad_grid[PART_ID_NxN_TL] =
                (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
            pi4_sad_grid[PART_ID_NxN_TR] =
                (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
            pi4_sad_grid[PART_ID_NxN_BL] =
                (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
            pi4_sad_grid[PART_ID_NxN_BR] =
                (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);

            /* Symmetric halves: built from the quadrants */
            pi4_sad_grid[PART_ID_Nx2N_L] =
                pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
            pi4_sad_grid[PART_ID_Nx2N_R] =
                pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
            pi4_sad_grid[PART_ID_2NxN_T] =
                pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
            pi4_sad_grid[PART_ID_2NxN_B] =
                pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];

            /* Narrow strips of the asymmetric partitions: one row / column */
            /* of 4x4 blocks each                                           */
            pi4_sad_grid[PART_ID_nLx2N_L] =
                (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
            pi4_sad_grid[PART_ID_nRx2N_R] =
                (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
            pi4_sad_grid[PART_ID_2NxnU_T] =
                (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
            pi4_sad_grid[PART_ID_2NxnD_B] =
                (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);

            /* Whole block, then the wide asymmetric parts as whole - strip */
            pi4_sad_grid[PART_ID_2Nx2N] =
                pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
            pi4_sad_grid[PART_ID_2NxnU_B] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
            pi4_sad_grid[PART_ID_2NxnD_T] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
            pi4_sad_grid[PART_ID_nRx2N_L] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
            pi4_sad_grid[PART_ID_nLx2N_R] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
        }

        /**********************************************************************/
        /* Compute the MV cost and update the best result of each partition   */
        /**********************************************************************/
        {
            S32 i4_count, i4_sad, i4_mv_cost, i4_tot_cost;
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];

            {
                S16 mvdx1, mvdy1;
                S32 pred_lx = (S32)ps_result_prms->i1_ref_idx;
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;

                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;

                /* Shifts handed to COMPUTE_DIFF_MV for the candidate and the */
                /* predictor; the predictor shift is dropped when its node    */
                /* has u1_subpel_done set                                     */
                S32 inp_shift = 2;
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
                S32 lambda = ps_pred_ctxt->lambda;
                S32 rnd = 1 << (lambda_q_shift - 1);
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
                S32 ref_bits =
                    ps_pred_ctxt
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];

                COMPUTE_DIFF_MV(
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);

                mvdx1 = ABS(mvdx1);
                mvdy1 = ABS(mvdy1);

                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
                             (mvdy1 > 0) + ref_bits + 2;

                /* Scale by lambda with rounding, then bring back from Q format */
                i4_mv_cost *= lambda;
                i4_mv_cost += rnd;
                i4_mv_cost >>= lambda_q_shift;

                i4_mv_cost = CLIP_U16(i4_mv_cost);
            }

            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
            {
                S32 part_id = pi4_valid_part_ids[i4_count];
                /* Result slot: partition id when more than 8 partitions are */
                /* valid, otherwise the position in the valid-partition list */
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
                S32 best_node_cost;

                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);

                /* Only one result per partition is tracked, so a candidate   */
                /* matters only if it is strictly better than the stored      */
                /* best. Ties keep the earlier candidate. No separate check   */
                /* against SHRT_MAX is needed: assuming CLIP_S16 saturates to */
                /* the signed 16-bit range, best_node_cost never exceeds it.  */
                if(i4_tot_cost < best_node_cost)
                {
                    ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                    ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                    ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
                    ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
                    ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
                }
            }
        }
    }

    /* Slots that still hold the maximum cost are given the ref idx of the */
    /* first search node.                                                  */
    /* NOTE(review): slots are addressed by part_id here, whereas the      */
    /* update loop above uses 'index' (loop position when at most 8        */
    /* partitions are valid). Confirm against the code that initialises    */
    /* and consumes these arrays before changing it.                       */
    {
        WORD32 i4_i;
        WORD32 part_id;
        search_node_t *ps_first_node = ps_search_prms->ps_search_nodes;

        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
        {
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];

            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
            {
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);

                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_first_node->i1_ref_idx;
            }
        }
    }
}
3082 
hme_calc_stim_injected_sad_and_1_best_result(hme_search_prms_t * ps_search_prms,wgt_pred_ctxt_t * ps_wt_inp_prms,err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms,U08 ** ppu1_ref,S32 i4_ref_stride)3083 void hme_calc_stim_injected_sad_and_1_best_result(
3084     hme_search_prms_t *ps_search_prms,
3085     wgt_pred_ctxt_t *ps_wt_inp_prms,
3086     err_prms_t *ps_err_prms,
3087     result_upd_prms_t *ps_result_prms,
3088     U08 **ppu1_ref,
3089     S32 i4_ref_stride)
3090 {
3091     mv_refine_ctxt_t *ps_mv_refine_ctxt;
3092     search_node_t *ps_search_node;
3093 
3094     S32 i4_candt;
3095     S32 i4_count;
3096     S32 i4_inp_off;
3097     S32 i4_ref_offset;
3098     S32 i4_num_nodes;
3099     ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
3100         au8_final_ref_sigmaXSquared[17];
3101     UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
3102     S32 *pi4_valid_part_ids;
3103 
3104     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3105     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3106     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3107     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3108     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3109 
3110     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
3111     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3112     i4_inp_off = ps_search_prms->i4_cu_x_off;
3113     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
3114     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3115     ps_search_node = ps_search_prms->ps_search_nodes;
3116     pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
3117 
3118     /* Set local pointer to point to partition level sigma values calculated in hme_refine */
3119     au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
3120     au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
3121 
3122     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3123     {
3124         {
3125             WORD32 b, c, d;
3126             UWORD8 *pu1_cur_ptr;
3127             UWORD8 *pu1_ref_ptr;
3128             UWORD16 au2_4x4_sad[NUM_4X4];
3129 
3130             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3131             {
3132                 continue;
3133             }
3134 
3135             ps_err_prms->pu1_inp =
3136                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3137             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3138             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3139             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3140 
3141             pu1_cur_ptr = ps_err_prms->pu1_inp;
3142             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3143 
3144             /* Loop to compute the SAD's */
3145             {
3146                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3147                 for(b = 0; b < NUM_4X4; b++)
3148                 {
3149                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3150                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3151 
3152                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3153                     {
3154                         WORD32 z_cur = (cur_buf_stride)*c + t1;
3155                         WORD32 z_ref = (ref_buf_stride)*c + t2;
3156                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3157                         {
3158                             au2_4x4_sad[b] += (UWORD16)ABS((
3159                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3160                         }
3161                     }
3162                 }
3163 
3164                 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
3165                 hme_compute_sigmaX_and_sigmaXSquared(
3166                     pu1_ref_ptr,
3167                     ref_buf_stride,
3168                     au4_4x4_ref_sigmaX,
3169                     au4_4x4_ref_sigmaXSquared,
3170                     4,
3171                     4,
3172                     16,
3173                     16,
3174                     1,
3175                     4);
3176 
3177                 pi4_sad_grid[PART_ID_NxN_TL] =
3178                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3179                 pi4_sad_grid[PART_ID_NxN_TR] =
3180                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3181                 pi4_sad_grid[PART_ID_NxN_BL] =
3182                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3183                 pi4_sad_grid[PART_ID_NxN_BR] =
3184                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3185                 pi4_sad_grid[PART_ID_Nx2N_L] =
3186                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3187                 pi4_sad_grid[PART_ID_Nx2N_R] =
3188                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3189                 pi4_sad_grid[PART_ID_2NxN_T] =
3190                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3191                 pi4_sad_grid[PART_ID_2NxN_B] =
3192                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3193                 pi4_sad_grid[PART_ID_nLx2N_L] =
3194                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3195                 pi4_sad_grid[PART_ID_nRx2N_R] =
3196                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3197                 pi4_sad_grid[PART_ID_2NxnU_T] =
3198                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3199                 pi4_sad_grid[PART_ID_2NxnD_B] =
3200                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3201                 pi4_sad_grid[PART_ID_2Nx2N] =
3202                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3203                 pi4_sad_grid[PART_ID_2NxnU_B] =
3204                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3205                 pi4_sad_grid[PART_ID_2NxnD_T] =
3206                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3207                 pi4_sad_grid[PART_ID_nRx2N_L] =
3208                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3209                 pi4_sad_grid[PART_ID_nLx2N_R] =
3210                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3211             }
3212         }
3213 
3214         {
3215             S32 i4_sad, i4_mv_cost, i4_tot_cost;
3216             S32 best_node_cost;
3217             S32 second_best_node_cost;
3218             ULWORD64 u8_temp_var, u8_temp_var1;
3219             ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
3220 
3221             {
3222                 S16 mvdx1, mvdy1;
3223                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
3224                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
3225                 S32 pred_lx = i4_search_idx;
3226 
3227                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
3228                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
3229                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
3230 
3231                 S32 inp_shift = 2;
3232                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3233                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
3234                 S32 lambda = ps_pred_ctxt->lambda;
3235                 S32 rnd = 1 << (lambda_q_shift - 1);
3236                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3237                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3238                 S32 ref_bits =
3239                     ps_pred_ctxt
3240                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
3241 
3242                 COMPUTE_DIFF_MV(
3243                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
3244 
3245                 mvdx1 = ABS(mvdx1);
3246                 mvdy1 = ABS(mvdy1);
3247 
3248                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
3249                              (mvdy1 > 0) + ref_bits + 2;
3250 
3251                 i4_mv_cost *= lambda;
3252                 i4_mv_cost += rnd;
3253                 i4_mv_cost >>= lambda_q_shift;
3254 
3255                 i4_mv_cost = CLIP_U16(i4_mv_cost);
3256             }
3257 
3258             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3259             {
3260                 S32 i4_stim_injected_sad;
3261                 S32 i4_stim_injected_cost;
3262                 S32 i4_noise_term;
3263                 unsigned long u4_shift_val;
3264                 S32 i4_bits_req;
3265 
3266                 S32 update_required = 0;
3267                 S32 part_id = pi4_valid_part_ids[i4_count];
3268                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3269 
3270                 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
3271 
3272                 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
3273 
3274                 if(ps_search_prms->i4_alpha_stim_multiplier)
3275                 {
3276                     /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
3277                     hme_compute_final_sigma_of_pu_from_base_blocks(
3278                         au4_4x4_ref_sigmaX,
3279                         au4_4x4_ref_sigmaXSquared,
3280                         au8_final_ref_sigmaX,
3281                         au8_final_ref_sigmaXSquared,
3282                         16,
3283                         4,
3284                         part_id,
3285                         4);
3286 
3287                     u8_ref_X_Square =
3288                         (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
3289                     u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
3290 
3291                     /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
3292                     /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
3293                     /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
3294                     u4_shift_val = ihevce_calc_stim_injected_variance(
3295                         au8_final_src_sigmaX,
3296                         au8_final_src_sigmaXSquared,
3297                         &u8_src_var,
3298                         i4_inv_wt,
3299                         ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
3300                         ps_wt_inp_prms->wpred_log_wdc,
3301                         part_id);
3302 
3303                     u8_ref_var = u8_ref_var >> u4_shift_val;
3304 
3305                     /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
3306                     GETRANGE64(i4_bits_req, u8_ref_var);
3307 
3308                     if(i4_bits_req > 27)
3309                     {
3310                         u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
3311                         u8_src_var = u8_src_var >> (i4_bits_req - 27);
3312                     }
3313 
3314                     if(u8_src_var == u8_ref_var)
3315                     {
3316                         u8_temp_var = (1 << STIM_Q_FORMAT);
3317                     }
3318                     else
3319                     {
3320                         u8_temp_var = (2 * u8_src_var * u8_ref_var);
3321                         u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
3322                         u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
3323                         u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
3324                         u8_temp_var = (u8_temp_var / u8_temp_var1);
3325                     }
3326 
3327                     i4_noise_term = (UWORD32)u8_temp_var;
3328 
3329                     ASSERT(i4_noise_term >= 0);
3330 
3331                     i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
3332                 }
3333                 else
3334                 {
3335                     i4_noise_term = 0;
3336                 }
3337                 u8_pure_dist = pi4_sad_grid[part_id];
3338                 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
3339                 u8_pure_dist += (1 << ((i4_q_level)-1));
3340                 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
3341 
3342                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3343                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3344                 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
3345                 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
3346 
3347                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
3348                 second_best_node_cost = SHRT_MAX;
3349 
3350                 if(i4_stim_injected_cost < second_best_node_cost)
3351                 {
3352                     update_required = 0;
3353 
3354                     if(i4_stim_injected_cost < best_node_cost)
3355                     {
3356                         update_required = 1;
3357                     }
3358                     else if(i4_stim_injected_cost == best_node_cost)
3359                     {
3360                         update_required = 0;
3361                     }
3362 
3363                     if(update_required == 2)
3364                     {
3365                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3366                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
3367                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3368                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3369                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3370                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3371                     }
3372                     else if(update_required == 1)
3373                     {
3374                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3375                         ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
3376                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3377                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3378                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3379                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3380                     }
3381                 }
3382             }
3383         }
3384 
3385         ps_search_node++;
3386     }
3387 
3388     {
3389         WORD32 i4_i;
3390         WORD32 part_id;
3391         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3392         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3393         {
3394             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3395             if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3396             {
3397                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3398                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3399                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3400 
3401                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3402             }
3403         }
3404     }
3405 }
3406 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_1_best_result_subpel(err_prms_t *ps_err_prms,
*                                                result_upd_prms_t *ps_result_prms)
*
*  @brief   Computes the SAD of each of the NUM_4X4 4x4 sub-blocks (laid out
*           four per row) between the input and the reference pointer,
*           derives the SAD of every partition from those sub-block SADs and
*           then, for each valid partition, replaces the stored best result
*           when the new total cost is strictly lower.
*
*  @param[in]     ps_err_prms    : supplies pu1_inp, pu1_ref, i4_inp_stride,
*                                  i4_ref_stride; pi4_sad_grid receives the
*                                  partition SADs
*  @param[in,out] ps_result_prms : supplies ps_subpel_refine_ctxt (updated in
*                                  place) and the mv / ref idx stored with a
*                                  new best result
*
*  @return   None
********************************************************************************
*/
void hme_calc_sad_and_1_best_result_subpel(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;

    S32 i4_inp_strd = ps_err_prms->i4_inp_stride;
    WORD32 i4_ref_strd = ps_err_prms->i4_ref_stride;

    /**********************************************************************/
    /* Step 1: SAD of every 4x4 sub-block, then SAD of every partition     */
    /**********************************************************************/
    {
        UWORD8 *pu1_inp = ps_err_prms->pu1_inp;
        UWORD8 *pu1_ref = &ps_err_prms->pu1_ref[0];
        UWORD16 au2_blk_sad[NUM_4X4];
        WORD32 blk, row, col;

        for(blk = 0; blk < NUM_4X4; blk++)
        {
            /* Top-left pixel of this 4x4 inside the input / reference block */
            WORD32 i4_col_off = (blk % 4) * NUM_PIXELS_IN_ROW;
            WORD32 i4_inp_blk_off = i4_col_off + (blk >> 2) * (i4_inp_strd << 2);
            WORD32 i4_ref_blk_off = i4_col_off + (blk >> 2) * (i4_ref_strd << 2);
            UWORD16 u2_acc = 0;

            for(row = 0; row < NUM_ROWS_IN_4X4; row++)
            {
                WORD32 i4_inp_row_off = i4_inp_blk_off + row * i4_inp_strd;
                WORD32 i4_ref_row_off = i4_ref_blk_off + row * i4_ref_strd;

                for(col = 0; col < NUM_PIXELS_IN_ROW; col++)
                {
                    S32 i4_diff = ((S32)pu1_ref[i4_ref_row_off + col]) -
                                  ((S32)pu1_inp[i4_inp_row_off + col]);

                    u2_acc += (UWORD16)ABS(i4_diff);
                }
            }

            au2_blk_sad[blk] = u2_acc;
        }

        {
            /* Quadrant sums */
            S32 i4_tl = au2_blk_sad[0] + au2_blk_sad[1] + au2_blk_sad[4] + au2_blk_sad[5];
            S32 i4_tr = au2_blk_sad[2] + au2_blk_sad[3] + au2_blk_sad[6] + au2_blk_sad[7];
            S32 i4_bl = au2_blk_sad[8] + au2_blk_sad[9] + au2_blk_sad[12] + au2_blk_sad[13];
            S32 i4_br = au2_blk_sad[10] + au2_blk_sad[11] + au2_blk_sad[14] + au2_blk_sad[15];

            /* Outermost column / row strips used by the asymmetric partitions */
            S32 i4_left_col = au2_blk_sad[8] + au2_blk_sad[0] + au2_blk_sad[12] + au2_blk_sad[4];
            S32 i4_right_col =
                au2_blk_sad[3] + au2_blk_sad[7] + au2_blk_sad[15] + au2_blk_sad[11];
            S32 i4_top_row = au2_blk_sad[1] + au2_blk_sad[0] + au2_blk_sad[2] + au2_blk_sad[3];
            S32 i4_bot_row =
                au2_blk_sad[15] + au2_blk_sad[14] + au2_blk_sad[12] + au2_blk_sad[13];

            S32 i4_top_half = i4_tr + i4_tl;
            S32 i4_bot_half = i4_br + i4_bl;
            S32 i4_full = i4_top_half + i4_bot_half;

            pi4_sad_grid[PART_ID_NxN_TL] = i4_tl;
            pi4_sad_grid[PART_ID_NxN_TR] = i4_tr;
            pi4_sad_grid[PART_ID_NxN_BL] = i4_bl;
            pi4_sad_grid[PART_ID_NxN_BR] = i4_br;

            pi4_sad_grid[PART_ID_Nx2N_L] = i4_tl + i4_bl;
            pi4_sad_grid[PART_ID_Nx2N_R] = i4_tr + i4_br;
            pi4_sad_grid[PART_ID_2NxN_T] = i4_top_half;
            pi4_sad_grid[PART_ID_2NxN_B] = i4_bot_half;

            pi4_sad_grid[PART_ID_nLx2N_L] = i4_left_col;
            pi4_sad_grid[PART_ID_nRx2N_R] = i4_right_col;
            pi4_sad_grid[PART_ID_2NxnU_T] = i4_top_row;
            pi4_sad_grid[PART_ID_2NxnD_B] = i4_bot_row;

            pi4_sad_grid[PART_ID_2Nx2N] = i4_full;

            /* The larger half of each asymmetric split is the remainder */
            pi4_sad_grid[PART_ID_2NxnU_B] = i4_full - i4_top_row;
            pi4_sad_grid[PART_ID_2NxnD_T] = i4_full - i4_bot_row;
            pi4_sad_grid[PART_ID_nRx2N_L] = i4_full - i4_right_col;
            pi4_sad_grid[PART_ID_nLx2N_R] = i4_full - i4_left_col;
        }
    }

    /**********************************************************************/
    /* Step 2: per valid partition, keep the result with the lowest cost   */
    /**********************************************************************/
    {
        S32 i4_num_parts = ps_ctxt->i4_num_valid_parts;
        S32 *pi4_part_ids = &ps_ctxt->ai4_part_id[0];
        /* With more than 8 valid parts the result arrays are indexed by  */
        /* part id, otherwise by position in the valid part list          */
        S32 i4_idx_is_part_id = (i4_num_parts > 8);
        S32 i4_i;

        for(i4_i = 0; i4_i < i4_num_parts; i4_i++)
        {
            S32 part_id = pi4_part_ids[i4_i];
            S32 idx = i4_idx_is_part_id ? part_id : i4_i;

            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
            S32 i4_mv_cost = ps_ctxt->i2_mv_cost[0][idx];
            S32 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
            S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
            S32 i4_best_cost = CLIP_S16(ps_ctxt->i2_tot_cost[0][idx]);

            /* Only a strictly better, non-saturated cost displaces the best */
            if((i4_tot_cost >= SHRT_MAX) || (i4_tot_cost >= i4_best_cost))
            {
                continue;
            }

            ps_ctxt->i2_tot_cost[0][idx] = i4_tot_cost;
            ps_ctxt->i2_mv_cost[0][idx] = i4_mv_cost;
            ps_ctxt->i2_mv_x[0][idx] = ps_result_prms->i2_mv_x;
            ps_ctxt->i2_mv_y[0][idx] = ps_result_prms->i2_mv_y;
            ps_ctxt->i2_ref_idx[0][idx] = ps_result_prms->i1_ref_idx;
        }
    }

    /**********************************************************************/
    /* Step 3: entries whose best cost is saturated also get a saturated   */
    /* fullpel satd                                                        */
    /**********************************************************************/
    {
        WORD32 i4_part;

        for(i4_part = 0; i4_part < TOT_NUM_PARTS; i4_part++)
        {
            if(ps_ctxt->i2_tot_cost[0][i4_part] < MAX_SIGNED_16BIT_VAL)
            {
                continue;
            }

            ps_ctxt->ai2_fullpel_satd[0][i4_part] = MAX_SIGNED_16BIT_VAL;
        }
    }
}
3575 
3576 /**
3577 ********************************************************************************
3578 *  @fn     hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
3579 *                                              wgt_pred_ctxt_t *ps_wt_inp_prms,
3580 *                                              err_prms_t *ps_err_prms,
3581 *                                              result_upd_prms_t *ps_result_prms,
3582 *                                              U08 **ppu1_ref,
3583 *                                              S32 i4_ref_stride)
3584 *
3585 *  @brief   Run thorugh the provided candidates and compute the point SAD and
3586 *           cost and update the results in the order
3587 *
3588 *  @param[in]  ps_search_prms
3589 *  @param[in]  ps_wt_inp_prms
3590 *  @param[in]  ps_err_prms
3591 *  @param[out] ps_result_prms
3592 *  @param[in]  ppu1_ref
3593 *  @param[in]  i4_ref_stride
3594 *
3595 *  @return   None
3596 ********************************************************************************
3597 */
3598 
hme_calc_pt_sad_and_result_explicit(hme_search_prms_t * ps_search_prms,wgt_pred_ctxt_t * ps_wt_inp_prms,err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms,U08 ** ppu1_ref,S32 i4_ref_stride)3599 void hme_calc_pt_sad_and_result_explicit(
3600     hme_search_prms_t *ps_search_prms,
3601     wgt_pred_ctxt_t *ps_wt_inp_prms,
3602     err_prms_t *ps_err_prms,
3603     result_upd_prms_t *ps_result_prms,
3604     U08 **ppu1_ref,
3605     S32 i4_ref_stride)
3606 {
3607     WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
3608     WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
3609 
3610     search_node_t *ps_search_node;
3611     BLK_SIZE_T e_blk_size;
3612     PF_SAD_FXN_T pf_sad_fxn;
3613     PF_RESULT_FXN_T pf_hme_result_fxn;
3614 
3615     i4_grid_mask = 0x1; /* Point SAD */
3616 
3617     /* Get the parameters required */
3618     i4_part_mask = ps_search_prms->i4_part_mask;
3619     e_blk_size = ps_search_prms->e_blk_size;
3620     i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
3621     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3622     ps_search_node = ps_search_prms->ps_search_nodes;
3623 
3624     i4_inp_stride = ps_search_prms->i4_inp_stride;
3625     /* Move to the location of the search blk in inp buffer */
3626     i4_inp_off = ps_search_prms->i4_cu_x_off;
3627     i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
3628     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3629 
3630     pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
3631     /**********************************************************************/
3632     /* we have a sparsely populated SAD grid of size 9x17.                */
3633     /* the id of the results in the grid is shown                         */
3634     /*     5   2   6                                                      */
3635     /*     1   0   3                                                      */
3636     /*     7   4   8                                                      */
3637     /* The motivation for choosing a grid like this is that               */
3638     /* in case of no refinement, the central location is                  */
3639     /* the first entry in the grid                                        */
3640     /* Also for diamond, the 4 entries get considered first               */
3641     /* This is consistent with the diamond notation used in               */
3642     /* subpel refinement. To Check                                        */
3643     /* Update the results for the given search candt                      */
3644     /* returns the cost of the 2Nx2N partition                            */
3645     /**********************************************************************/
3646 
3647     /* Get the modified update result fun. with CLIP16 of cost to match   */
3648     /* with SIMD */
3649     pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
3650 
3651     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3652     {
3653         if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3654             continue;
3655 
3656         /* initialize minimum cost for this candidate. As we search around */
3657         /* this candidate, this is used to check early exit, when in any   */
3658         /* given iteration, the center pt of the grid is lowest value      */
3659         ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
3660 
3661         ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3662         ps_err_prms->i4_grid_mask = i4_grid_mask;
3663 
3664         ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3665         ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3666         ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3667 
3668         /**********************************************************************/
3669         /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3670         /**********************************************************************/
3671         pf_sad_fxn(ps_err_prms);
3672 
3673         /**********************************************************************/
3674         /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3675         /**********************************************************************/
3676         ps_result_prms->i4_grid_mask = i4_grid_mask;
3677         ps_result_prms->ps_search_node_base = ps_search_node;
3678         pf_hme_result_fxn(ps_result_prms);
3679 
3680         ps_search_node++;
3681     }
3682 }
3683 
3684 /**
3685 ********************************************************************************
3686 *  @fn     hme_set_mvp_node(search_results_t *ps_search_results,
3687 *                           search_node_t *ps_candt_prj_coloc,
3688 *                           S08 i1_ref_idx)
3689 *
3690 *  @brief   Set node used for motion vector predictor computation
3691 *           Either TR or L is compared to projected colocated and
3692 *           closest is decided as MVP
3693 *
3694 *  @param[in]  ps_search_results
3695 *
3696 *  @param[in]  ps_candt_prj_coloc
3697 *
3698 *  @param[in]  i1_ref_idx
3699 *
3700 *  @return   None
3701 ********************************************************************************
3702 */
hme_set_mvp_node(search_results_t * ps_search_results,search_node_t * ps_candt_prj_coloc,U08 u1_pred_lx,U08 u1_default_ref_id)3703 void hme_set_mvp_node(
3704     search_results_t *ps_search_results,
3705     search_node_t *ps_candt_prj_coloc,
3706     U08 u1_pred_lx,
3707     U08 u1_default_ref_id)
3708 {
3709     S32 i;
3710     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
3711     pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
3712     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
3713 
3714     S32 inp_shift = 2;
3715     S32 pred_shift;
3716     S32 ref_bits;
3717     S32 mv_p_x, mv_p_y;
3718     S16 mvdx1, mvdx2, mvdy1, mvdy2;
3719 
3720     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
3721 
3722     /*************************************************************************/
3723     /* Priority to bottom left availability. Else we go to left. If both are */
3724     /* not available, then a remains null                                    */
3725     /*************************************************************************/
3726     if(ps_pred_nodes->ps_l->u1_is_avail)
3727     {
3728         ps_pred_node_a = ps_pred_nodes->ps_l;
3729     }
3730 
3731     if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
3732     {
3733         ps_pred_node_b = ps_pred_nodes->ps_tr;
3734     }
3735     else
3736     {
3737         ps_pred_node_b = ps_pred_nodes->ps_coloc;
3738         ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3739     }
3740 
3741     if(ps_pred_node_a == NULL)
3742     {
3743         ps_pred_node_a = ps_pred_nodes->ps_coloc;
3744         ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
3745 
3746         if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
3747         {
3748             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
3749             ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3750         }
3751     }
3752 
3753     if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
3754     {
3755         SCALE_FOR_POC_DELTA(
3756             mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3757     }
3758     else
3759     {
3760         mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3761         mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3762     }
3763     pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3764     COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3765     mvdx1 = ABS(mvdx1);
3766     mvdy1 = ABS(mvdy1);
3767 
3768     if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
3769     {
3770         SCALE_FOR_POC_DELTA(
3771             mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3772     }
3773     else
3774     {
3775         mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
3776         mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
3777     }
3778     pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
3779     COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3780     mvdx2 = ABS(mvdx2);
3781     mvdy2 = ABS(mvdy2);
3782 
3783     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
3784     {
3785         for(i = 0; i < TOT_NUM_PARTS; i++)
3786         {
3787             ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
3788         }
3789     }
3790     else
3791     {
3792         for(i = 0; i < TOT_NUM_PARTS; i++)
3793         {
3794             ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
3795         }
3796     }
3797 }
3798