1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 *******************************************************************************
23 * @file
24 *  ih264e_intra_modes_eval.c
25 *
26 * @brief
27 *  This file contains definitions of routines that perform rate distortion
28 *  analysis on a macroblock if they are to be coded as intra.
29 *
30 * @author
31 *  ittiam
32 *
33 * @par List of Functions:
*  - ih264e_derive_nghbr_avbl_of_mbs()
35 *  - ih264e_derive_ngbr_avbl_of_mb_partitions()
36 *  - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff()
37 *  - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff()
38 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff()
39 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton()
40 *  - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff()
41 *  - ih264e_evaluate_intra16x16_modes()
42 *  - ih264e_evaluate_intra4x4_modes()
43 *  - ih264e_evaluate_intra_chroma_modes()
44 *
45 * @remarks
46 *  None
47 *
48 *******************************************************************************
49 */
50 
51 /*****************************************************************************/
52 /* File Includes                                                             */
53 /*****************************************************************************/
54 
55 /* System include files */
56 #include <stdio.h>
57 #include <string.h>
58 #include <limits.h>
59 #include <assert.h>
60 
61 /* User include files */
62 #include "ih264e_config.h"
63 #include "ih264_typedefs.h"
64 #include "ih264e_defs.h"
65 #include "iv2.h"
66 #include "ive2.h"
67 #include "ih264_debug.h"
68 #include "ih264_defs.h"
69 #include "ih264_macros.h"
70 #include "ih264_intra_pred_filters.h"
71 #include "ih264_structs.h"
72 #include "ih264_common_tables.h"
73 #include "ih264_trans_quant_itrans_iquant.h"
74 #include "ih264_inter_pred_filters.h"
75 #include "ih264_mem_fns.h"
76 #include "ih264_padding.h"
77 #include "ih264_deblk_edge_filters.h"
78 #include "ih264_cabac_tables.h"
79 #include "ime_distortion_metrics.h"
80 #include "ih264e_error.h"
81 #include "ih264e_bitstream.h"
82 #include "ime_defs.h"
83 #include "ime_structs.h"
84 #include "irc_cntrl_param.h"
85 #include "irc_frame_info_collector.h"
86 #include "ih264e_rate_control.h"
87 #include "ih264e_cabac_structs.h"
88 #include "ih264e_structs.h"
89 #include "ih264e_intra_modes_eval.h"
90 #include "ih264e_globals.h"
91 #include "ime_platform_macros.h"
92 
93 
94 /*****************************************************************************/
95 /* Function Definitions                                                      */
96 /*****************************************************************************/
97 
98 /**
99 ******************************************************************************
100 *
101 * @brief
102 *  derivation process for macroblock availability
103 *
104 * @par   Description
105 *  Calculates the availability of the left, top, topright and topleft macroblocks.
106 *
107 * @param[in] ps_proc_ctxt
108 *  pointer to proc context (handle)
109 *
110 * @remarks Based on section 6.4.5 in H264 spec
111 *
112 * @return  none
113 *
114 ******************************************************************************
115 */
ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t * ps_proc)116 void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
117 {
118     UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx;
119     UWORD8 *pu1_slice_idx_b;
120     UWORD8 *pu1_slice_idx_a;
121     UWORD8 *pu1_slice_idx_c;
122     UWORD8 *pu1_slice_idx_d;
123     block_neighbors_t *ps_ngbr_avbl;
124     WORD32 i4_mb_x, i4_mb_y;
125     WORD32 i4_wd_mbs;
126 
127     i4_mb_x = ps_proc->i4_mb_x;
128     i4_mb_y = ps_proc->i4_mb_y;
129 
130     i4_wd_mbs = ps_proc->i4_wd_mbs;
131 
132     pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x;
133     pu1_slice_idx_a = pu1_slice_idx_curr - 1;
134     pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs;
135     pu1_slice_idx_c = pu1_slice_idx_b + 1;
136     pu1_slice_idx_d = pu1_slice_idx_b - 1;
137     ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
138 
139     /**********************************************************************/
140     /* The macroblock is marked as available, unless one of the following */
141     /* conditions is true in which case the macroblock shall be marked as */
142     /* not available.                                                     */
143     /* 1. mbAddr < 0                                                      */
144     /* 2  mbAddr > CurrMbAddr                                             */
145     /* 3. the macroblock with address mbAddr belongs to a different slice */
146     /* than the macroblock with address CurrMbAddr                        */
147     /**********************************************************************/
148 
149     /* left macroblock availability */
150     if (i4_mb_x == 0)
151     { /* macroblocks along first column */
152         ps_ngbr_avbl->u1_mb_a = 0;
153     }
154     else
155     { /* macroblocks belong to same slice? */
156         if (*pu1_slice_idx_a != *pu1_slice_idx_curr)
157             ps_ngbr_avbl->u1_mb_a = 0;
158         else
159             ps_ngbr_avbl->u1_mb_a = 1;
160     }
161 
162     /* top macroblock availability */
163     if (i4_mb_y == 0)
164     { /* macroblocks along first row */
165         ps_ngbr_avbl->u1_mb_b = 0;
166     }
167     else
168     { /* macroblocks belong to same slice? */
169         if (*pu1_slice_idx_b != *pu1_slice_idx_curr)
170             ps_ngbr_avbl->u1_mb_b = 0;
171         else
172             ps_ngbr_avbl->u1_mb_b = 1;
173     }
174 
175     /* top right macroblock availability */
176     if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0)
177     { /* macroblocks along last column */
178         ps_ngbr_avbl->u1_mb_c = 0;
179     }
180     else
181     { /* macroblocks belong to same slice? */
182         if (*pu1_slice_idx_c != *pu1_slice_idx_curr)
183             ps_ngbr_avbl->u1_mb_c = 0;
184         else
185             ps_ngbr_avbl->u1_mb_c = 1;
186     }
187 
188     /* top left macroblock availability */
189     if (i4_mb_x == 0 || i4_mb_y == 0)
190     { /* macroblocks along first column */
191         ps_ngbr_avbl->u1_mb_d = 0;
192     }
193     else
194     { /* macroblocks belong to same slice? */
195         if (*pu1_slice_idx_d != *pu1_slice_idx_curr)
196             ps_ngbr_avbl->u1_mb_d = 0;
197         else
198             ps_ngbr_avbl->u1_mb_d = 1;
199     }
200 }
201 
202 /**
203 ******************************************************************************
204 *
205 * @brief
206 *  derivation process for subblock/partition availability
207 *
208 * @par   Description
209 *  Calculates the availability of the left, top, topright and topleft subblock
210 *  or partitions.
211 *
212 * @param[in]    ps_proc_ctxt
213 *  pointer to macroblock context (handle)
214 *
215 * @param[in]    i1_pel_pos_x
216 *  column position of the pel wrt the current block
217 *
218 * @param[in]    i1_pel_pos_y
219 *  row position of the pel in wrt current block
220 *
221 * @remarks     Assumptions: before calling this function it is assumed that
222 *   the neighbor availability of the current macroblock is already derived.
223 *   Based on table 6-3 of H264 specification
224 *
225 * @return      availability status (yes or no)
226 *
227 ******************************************************************************
228 */
ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t * ps_ngbr_avbl,WORD8 i1_pel_pos_x,WORD8 i1_pel_pos_y)229 UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
230                                                 WORD8 i1_pel_pos_x,
231                                                 WORD8 i1_pel_pos_y)
232 {
233     UWORD8 u1_neighbor_avail=0;
234 
235     /**********************************************************************/
236     /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to   */
237     /* various columns of a macroblock                                    */
238     /*                                                                    */
239     /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to   */
240     /* various rows of a macroblock                                       */
241     /*                                                                    */
242     /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements    */
243     /* outside the bound of an mb ie., represents its neighbors.          */
244     /**********************************************************************/
245     if (i1_pel_pos_x < 0)
246     { /* column(-1) */
247         if (i1_pel_pos_y < 0)
248         { /* row(-1) */
249             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */
250         }
251         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
252         { /* all rows of a macroblock */
253             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */
254         }
255         else /* if (i1_pel_pos_y >= 16) */
256         { /* rows(+16) */
257             u1_neighbor_avail = 0;  /* current mb bottom left availability */
258         }
259     }
260     else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16)
261     { /* all columns of a macroblock */
262         if (i1_pel_pos_y < 0)
263         { /* row(-1) */
264             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */
265         }
266         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
267         { /* all rows of a macroblock */
268             u1_neighbor_avail = 1; /* current mb availability */
269             /* availability of the partition is dependent on the position of the partition inside the mb */
270             /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */
271         }
272         else /* if (i1_pel_pos_y >= 16) */
273         { /* rows(+16) */
274             u1_neighbor_avail = 0;  /* current mb bottom availability */
275         }
276     }
277     else if (i1_pel_pos_x >= 16)
278     { /* column(+16) */
279         if (i1_pel_pos_y < 0)
280         { /* row(-1) */
281             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */
282         }
283         else /* if (i1_pel_pos_y >= 0) */
284         { /* all other rows */
285             u1_neighbor_avail = 0;  /* current mb right & bottom right availability */
286         }
287     }
288 
289     return u1_neighbor_avail;
290 }
291 
292 /**
293 ******************************************************************************
294 *
295 * @brief
296 *  evaluate best intra 16x16 mode (rate distortion opt off)
297 *
298 * @par Description
299 *  This function evaluates all the possible intra 16x16 modes and finds the mode
300 *  that best represents the macro-block (least distortion) and occupies fewer
301 *  bits in the bit-stream.
302 *
303 * @param[in]   ps_proc_ctxt
304 *  pointer to process context (handle)
305 *
306 * @remarks
307 *  Ideally the cost of encoding a macroblock is calculated as
308 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
309 *  input block and the reconstructed block and rate is the number of bits taken
310 *  to place the macroblock in the bit-stream. In this routine the rate does not
311 *  exactly point to the total number of bits it takes, rather it points to header
312 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
313 *  and residual bits fall in to texture bits the number of bits taken to encoding
314 *  mbtype is considered as rate, we compute cost. Further we will approximate
315 *  the distortion as the deviation b/w input and the predicted block as opposed
316 *  to input and reconstructed block.
317 *
318 *  NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
319 *  the SAD and cost are one and the same.
320 *
321 * @return     none
322 *
323 ******************************************************************************
324 */
325 
ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)326 void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
327 {
328     /* Codec Context */
329     codec_t *ps_codec = ps_proc->ps_codec;
330 
331     /* SAD(distortion metric) of an 8x8 block */
332     WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;
333 
334     /* lambda */
335     UWORD32 u4_lambda = ps_proc->u4_lambda;
336 
337     /* cost = distortion + lambda*rate */
338     WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX;
339 
340     /* intra mode */
341     UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;
342 
343     /* neighbor pels for intra prediction */
344     UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;
345 
346     /* neighbor availability */
347     WORD32 i4_ngbr_avbl;
348 
349     /* pointer to src macro block */
350     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
351     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
352 
353     /* pointer to prediction macro block */
354     UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
355     UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;
356 
357     /* strides */
358     WORD32 i4_src_strd = ps_proc->i4_src_strd;
359     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
360     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
361 
362     /* pointer to neighbors left, top, topleft */
363     UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
364     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
365     UWORD8 *pu1_mb_d = pu1_mb_b - 1;
366 
367     /* valid intra modes map */
368     UWORD32 u4_valid_intra_modes;
369 
370     /* lut for valid intra modes */
371     const UWORD8 u1_valid_intra_modes[8] = {4, 6, 12, 14, 5, 7, 13, 15};
372 
373     /* temp var */
374     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
375 
376     /* init temp var */
377     if (ps_proc->i4_slice_type != ISLICE)
378     {
379         /* Offset for MBtype */
380         offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
381         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
382     }
383 
384     /* locating neighbors that are available for prediction */
385     /* TODO : update the neighbor availability information basing on constrained intra pred information */
386     /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines
387      * basing on neighbors available and hence evade the computation of neighbor availability totally. */
388     /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
389     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1);
390     ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;
391 
392     /* gather prediction pels from the neighbors, if particular set is not available
393      * it is set to zero*/
394     /* left pels */
395     if (ps_proc->ps_ngbr_avbl->u1_mb_a)
396     {
397         for(i = 0; i < 16; i++)
398             pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd];
399     }
400     else
401     {
402         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE);
403     }
404     /* top pels */
405     if (ps_proc->ps_ngbr_avbl->u1_mb_b)
406     {
407         ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16);
408         /*for(i = 0; i < 16; i++)
409             pu1_ngbr_pels_i16[16+1+i] = pu1_mb_b[i];*/
410     }
411     else
412     {
413         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE);
414     }
415     /* topleft pels */
416     if (ps_proc->ps_ngbr_avbl->u1_mb_d)
417         pu1_ngbr_pels_i16[16] = *pu1_mb_d;
418     else
419         pu1_ngbr_pels_i16[16] = 0;
420 
421     /* set valid intra modes for evaluation */
422 //    u4_valid_intra_modes = 15;
423 ////    ih264e_filter_intra16x16modes(pu1_mb_curr, i4_src_strd, &u4_valid_intra_modes);
424 //    if (!ps_proc->ps_ngbr_avbl->u1_mb_a)
425 //        u4_valid_intra_modes &= ~(1 << HORZ_I16x16);
426 //    if (!ps_proc->ps_ngbr_avbl->u1_mb_b)
427 //        u4_valid_intra_modes &= ~(1 << VERT_I16x16);
428 ////    if (!ps_proc->ps_ngbr_avbl->u1_mb_a || !ps_proc->ps_ngbr_avbl->u1_mb_b || !ps_proc->ps_ngbr_avbl->u1_mb_d)
429 //    if (i4_ngbr_avbl != 7)
430 //        u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
431 
432     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
433 
434     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
435         u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
436 
437     /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
438     ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
439                                                   i4_src_strd, i4_pred_strd,
440                                                   i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
441                                                   u4_valid_intra_modes);
442 
443     /* cost = distortion + lambda*rate */
444     i4_mb_cost_least = i4_mb_distortion_least;
445 
446     if (( (u4_valid_intra_modes >> 3) & 1) != 0 && (ps_codec->s_cfg.u4_enc_speed_preset != IVE_FASTEST ||
447                     ps_proc->i4_slice_type == ISLICE))
448     {
449         /* intra prediction for PLANE mode*/
450         (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);
451 
452         /* evaluate distortion between the actual blk and the estimated blk for the given mode */
453         ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);
454 
455         /* cost = distortion + lambda*rate */
456         i4_mb_cost = i4_mb_distortion;
457 
458         /* update the least cost information if necessary */
459         if(i4_mb_cost < i4_mb_distortion_least)
460         {
461             u4_intra_mode = PLANE_I16x16;
462 
463             i4_mb_cost_least = i4_mb_cost;
464             i4_mb_distortion_least = i4_mb_distortion;
465         }
466     }
467 
468     u4_best_intra_16x16_mode = u4_intra_mode;
469 
470     DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);
471 
472     ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;
473 
474     /* cost = distortion + lambda*rate */
475     i4_mb_cost_least    = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode];
476 
477 
478     /* update the type of the mb if necessary */
479     if (i4_mb_cost_least < ps_proc->i4_mb_cost)
480     {
481         ps_proc->i4_mb_cost = i4_mb_cost_least;
482         ps_proc->i4_mb_distortion = i4_mb_distortion_least;
483         ps_proc->u4_mb_type = I16x16;
484     }
485 
486     return ;
487 }
488 
489 
490 /**
491 ******************************************************************************
492 *
493 * @brief
494 *  evaluate best intra 8x8 mode (rate distortion opt on)
495 *
496 * @par Description
497 *  This function evaluates all the possible intra 8x8 modes and finds the mode
498 *  that best represents the macro-block (least distortion) and occupies fewer
499 *  bits in the bit-stream.
500 *
501 * @param[in]    ps_proc_ctxt
502 *  pointer to proc ctxt
503 *
504 * @remarks Ideally the cost of encoding a macroblock is calculated as
505 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
506 *  input block and the reconstructed block and rate is the number of bits taken
507 *  to place the macroblock in the bit-stream. In this routine the rate does not
508 *  exactly point to the total number of bits it takes, rather it points to header
509 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
510 *  and residual bits fall in to texture bits the number of bits taken to encoding
511 *  mbtype is considered as rate, we compute cost. Further we will approximate
512 *  the distortion as the deviation b/w input and the predicted block as opposed
513 *  to input and reconstructed block.
514 *
515 *  NOTE: TODO: This function needs to be tested
516 *
517 *  @return      none
518 *
519 ******************************************************************************
520 */
ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)521 void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
522 {
523     /* Codec Context */
524     codec_t *ps_codec = ps_proc->ps_codec;
525 
526     /* SAD(distortion metric) of an 4x4 block */
527     WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
528 
529     /* lambda */
530     UWORD32 u4_lambda = ps_proc->u4_lambda;
531 
532     /* cost = distortion + lambda*rate */
533     WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
534 
535     /* cost due to mbtype */
536     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
537 
538     /* intra mode */
539     UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
540 
541     /* neighbor pels for intra prediction */
542     UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
543 
544     /* pointer to curr partition */
545     UWORD8 *pu1_mb_curr;
546 
547     /* pointer to prediction macro block */
548     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
549 
550     /* strides */
551     WORD32 i4_src_strd = ps_proc->i4_src_strd;
552     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
553 
554     /* neighbors left, top, top right, top left */
555     UWORD8 *pu1_mb_a;
556     UWORD8 *pu1_mb_b;
557     UWORD8 *pu1_mb_d;
558 
559     /* neighbor availability */
560     WORD32 i4_ngbr_avbl;
561     block_neighbors_t s_ngbr_avbl;
562 
563     /* temp vars */
564     UWORD32  b8, u4_pix_x, u4_pix_y;
565 
566     /* ngbr mb syntax information */
567     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
568     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
569 
570     /* valid intra modes map */
571     UWORD32 u4_valid_intra_modes;
572 
573     for(b8 = 0; b8 < 4; b8++)
574     {
575         u4_pix_x = (b8 & 0x01) << 3;
576         u4_pix_y = (b8 >> 1) << 3;
577 
578         pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
579         /* when rdopt is off, we use the input as reference for constructing prediction buffer */
580         /* as opposed to using the recon pels. (open loop intra prediction) */
581         pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
582         pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
583         pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
584 
585         /* locating neighbors that are available for prediction */
586         /* TODO : update the neighbor availability information basing on constrained intra pred information */
587         /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
588         /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
589         s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
590         s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
591         s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
592         s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
593 
594         /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
595         i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
596                         (s_ngbr_avbl.u1_mb_a << 4);
597         /* if top partition is available and top right is not available for intra prediction, then */
598         /* padd top right samples using top sample and make top right also available */
599         /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
600         ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
601 
602 
603         ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
604                                                      i4_src_strd, i4_ngbr_avbl);
605 
606         i4_partition_cost_least = INT_MAX;
607         /* set valid intra modes for evaluation */
608         u4_valid_intra_modes = 0x1ff;
609 
610         if (!s_ngbr_avbl.u1_mb_b)
611         {
612             u4_valid_intra_modes &= ~(1 << VERT_I4x4);
613             u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
614             u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
615         }
616         if (!s_ngbr_avbl.u1_mb_a)
617         {
618             u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
619             u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
620         }
621         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
622         {
623             u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
624             u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
625             u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
626         }
627 
628         /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
629         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
630         {
631             u4_estimated_intra_8x8_mode = DC_I8x8;
632         }
633         else
634         {
635             UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
636             UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
637 
638             if (u4_pix_x == 0)
639             {
640                 if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
641                 {
642                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
643                 }
644                 else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
645                 {
646                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
647                 }
648             }
649             else
650             {
651                 u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
652             }
653 
654             if (u4_pix_y == 0)
655             {
656                 if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
657                 {
658                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
659                 }
660                 else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
661                 {
662                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
663                 }
664             }
665             else
666             {
667                 u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
668             }
669 
670             u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
671         }
672 
673         /* perform intra mode 8x8 evaluation */
674         for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
675         {
676             if ( (u4_valid_intra_modes & 1) == 0)
677                 continue;
678 
679             /* intra prediction */
680             (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
681 
682             /* evaluate distortion between the actual blk and the estimated blk for the given mode */
683             ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
684 
685             i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
686 
687             /* update the least cost information if necessary */
688             if (i4_partition_cost < i4_partition_cost_least)
689             {
690                 i4_partition_cost_least = i4_partition_cost;
691                 i4_partition_distortion_least = i4_partition_distortion;
692                 u4_best_intra_8x8_mode = u4_intra_mode;
693             }
694         }
695         /* macroblock distortion */
696         i4_total_cost += i4_partition_cost_least;
697         i4_total_distortion += i4_partition_distortion_least;
698         /* mb partition mode */
699         ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
700 
701     }
702 
703     /* update the type of the mb if necessary */
704     if (i4_total_cost < ps_proc->i4_mb_cost)
705     {
706         ps_proc->i4_mb_cost = i4_total_cost;
707         ps_proc->i4_mb_distortion = i4_total_distortion;
708         ps_proc->u4_mb_type = I8x8;
709     }
710 
711     return ;
712 }
713 
714 
715 /**
716 ******************************************************************************
717 *
718 * @brief
719 *  evaluate best intra 4x4 mode (rate distortion opt off)
720 *
721 * @par Description
722 *  This function evaluates all the possible intra 4x4 modes and finds the mode
723 *  that best represents the macro-block (least distortion) and occupies fewer
724 *  bits in the bit-stream.
725 *
726 * @param[in]    ps_proc_ctxt
727 *  pointer to proc ctxt
728 *
729 * @remarks
730 *  Ideally the cost of encoding a macroblock is calculated as
731 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
732 *  input block and the reconstructed block and rate is the number of bits taken
733 *  to place the macroblock in the bit-stream. In this routine the rate does not
734 *  exactly point to the total number of bits it takes, rather it points to header
735 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
736 *  and residual bits fall in to texture bits the number of bits taken to encoding
737 *  mbtype is considered as rate, we compute cost. Further we will approximate
738 *  the distortion as the deviation b/w input and the predicted block as opposed
739 *  to input and reconstructed block.
740 *
741 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
742 *  24*lambda is added to the SAD before comparison with the best SAD for
743 *  inter prediction. This is an empirical value to prevent using too many intra
744 *  blocks.
745 *
746 * @return      none
747 *
748 ******************************************************************************
749 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)750 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
751 {
752     /* Codec Context */
753     codec_t *ps_codec = ps_proc->ps_codec;
754 
755     /* SAD(distortion metric) of an 4x4 block */
756     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
757 
758     /* lambda */
759     UWORD32 u4_lambda = ps_proc->u4_lambda;
760 
761     /* cost = distortion + lambda*rate */
762     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
763 
764     /* cost due to mbtype */
765     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
766 
767     /* intra mode */
768     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
769 
770     /* neighbor pels for intra prediction */
771     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
772 
773     /* pointer to curr partition */
774     UWORD8 *pu1_mb_curr;
775 
776     /* pointer to prediction macro block */
777     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
778 
779     /* strides */
780     WORD32 i4_src_strd = ps_proc->i4_src_strd;
781     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
782 
783     /* neighbors left, top, top right, top left */
784     UWORD8 *pu1_mb_a;
785     UWORD8 *pu1_mb_b;
786     UWORD8 *pu1_mb_c;
787     UWORD8 *pu1_mb_d;
788 
789     /* neighbor availability */
790     WORD32 i4_ngbr_avbl;
791     block_neighbors_t s_ngbr_avbl;
792 
793     /* temp vars */
794     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
795 
796     /* scan order inside 4x4 block */
797     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
798 
799     /* ngbr sub mb modes */
800     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
801     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
802 
803     /* valid intra modes map */
804     UWORD32 u4_valid_intra_modes;
805     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
806 
807     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3);
808     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
809 
810     for (b8 = 0; b8 < 4; b8++)
811     {
812         u4_blk_x = (b8 & 0x01) << 3;
813         u4_blk_y = (b8 >> 1) << 3;
814         for (b4 = 0; b4 < 4; b4++)
815         {
816             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
817             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
818 
819             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
820             /* when rdopt is off, we use the input as reference for constructing prediction buffer */
821             /* as opposed to using the recon pels. (open loop intra prediction) */
822             pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
823             pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
824             pu1_mb_c = pu1_mb_b + 4; /* pointer to top macro block */
825             pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
826 
827             /* locating neighbors that are available for prediction */
828             /* TODO : update the neighbor availability information basing on constrained intra pred information */
829             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
830             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
831 
832             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
833             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
834             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
835             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
836             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
837             /* set valid intra modes for evaluation */
838             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
839 
840             /* if top partition is available and top right is not available for intra prediction, then */
841             /* padd top right samples using top sample and make top right also available */
842             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
843 
844             /* gather prediction pels from the neighbors */
845             if (s_ngbr_avbl.u1_mb_a)
846             {
847                 for(i = 0; i < 4; i++)
848                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
849             }
850             else
851             {
852                 memset(pu1_ngbr_pels_i4, 0, 4);
853             }
854 
855             if (s_ngbr_avbl.u1_mb_b)
856             {
857                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
858             }
859             else
860             {
861                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
862             }
863 
864             if (s_ngbr_avbl.u1_mb_d)
865                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
866             else
867                 pu1_ngbr_pels_i4[4] = 0;
868 
869             if (s_ngbr_avbl.u1_mb_c)
870             {
871                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
872             }
873             else if (s_ngbr_avbl.u1_mb_b)
874             {
875                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
876                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
877             }
878 
879             i4_partition_cost_least = INT_MAX;
880 
881             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
882             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
883             {
884                 u4_estimated_intra_4x4_mode = DC_I4x4;
885             }
886             else
887             {
888                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
889                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
890 
891                 if (u4_pix_x == 0)
892                 {
893                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
894                     {
895                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
896                     }
897                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
898                     {
899                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
900                     }
901                 }
902                 else
903                 {
904                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
905                 }
906 
907                 if (u4_pix_y == 0)
908                 {
909                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
910                     {
911                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
912                     }
913                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
914                     {
915                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
916                     }
917                 }
918                 else
919                 {
920                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
921                 }
922 
923                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
924             }
925 
926             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
927 
928             /* mode evaluation and prediction */
929             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
930                                                          pu1_ngbr_pels_i4,
931                                                          pu1_pred_mb, i4_src_strd,
932                                                          i4_pred_strd, i4_ngbr_avbl,
933                                                          &u4_best_intra_4x4_mode,
934                                                          &i4_partition_cost_least,
935                                                          u4_valid_intra_modes,
936                                                          u4_lambda,
937                                                          u4_estimated_intra_4x4_mode);
938 
939 
940             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
941 
942             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
943             /* macroblock distortion */
944             i4_total_distortion += i4_partition_distortion_least;
945             i4_total_cost += i4_partition_cost_least;
946             /* mb partition mode */
947             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
948         }
949     }
950 
951     /* update the type of the mb if necessary */
952     if (i4_total_cost < ps_proc->i4_mb_cost)
953     {
954         ps_proc->i4_mb_cost = i4_total_cost;
955         ps_proc->i4_mb_distortion = i4_total_distortion;
956         ps_proc->u4_mb_type = I4x4;
957     }
958 
959     return ;
960 }
961 
962 /**
963 ******************************************************************************
964 *
965 * @brief evaluate best intra 4x4 mode (rate distortion opt on)
966 *
967 * @par Description
968 *  This function evaluates all the possible intra 4x4 modes and finds the mode
969 *  that best represents the macro-block (least distortion) and occupies fewer
970 *  bits in the bit-stream.
971 *
972 * @param[in]    ps_proc_ctxt
973 *  pointer to proc ctxt
974 *
975 * @remarks
976 *  Ideally the cost of encoding a macroblock is calculated as
977 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
978 *  input block and the reconstructed block and rate is the number of bits taken
979 *  to place the macroblock in the bit-stream. In this routine the rate does not
980 *  exactly point to the total number of bits it takes, rather it points to header
981 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
982 *  and residual bits fall in to texture bits the number of bits taken to encoding
983 *  mbtype is considered as rate, we compute cost. Further we will approximate
984 *  the distortion as the deviation b/w input and the predicted block as opposed
985 *  to input and reconstructed block.
986 *
987 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
988 *  24*lambda is added to the SAD before comparison with the best SAD for
989 *  inter prediction. This is an empirical value to prevent using too many intra
990 *  blocks.
991 *
992 * @return      none
993 *
994 ******************************************************************************
995 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t * ps_proc)996 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
997 {
998     /* Codec Context */
999     codec_t *ps_codec = ps_proc->ps_codec;
1000 
1001     /* SAD(distortion metric) of an 4x4 block */
1002     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
1003 
1004     /* lambda */
1005     UWORD32 u4_lambda = ps_proc->u4_lambda;
1006 
1007     /* cost = distortion + lambda*rate */
1008     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
1009 
1010     /* cost due to mbtype */
1011     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
1012 
1013     /* intra mode */
1014     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
1015 
1016     /* neighbor pels for intra prediction */
1017     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
1018 
1019     /* pointer to curr partition */
1020     UWORD8 *pu1_mb_curr;
1021     UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
1022     UWORD8 *pu1_ref_mb_intra_4x4;
1023 
1024     /* pointer to residual macro block */
1025     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1026 
1027     /* pointer to prediction macro block */
1028     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
1029 
1030     /* strides */
1031     WORD32 i4_src_strd = ps_proc->i4_src_strd;
1032     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1033     WORD32 i4_ref_strd_left, i4_ref_strd_top;
1034 
1035     /* neighbors left, top, top right, top left */
1036     UWORD8 *pu1_mb_a;
1037     UWORD8 *pu1_mb_b;
1038     UWORD8 *pu1_mb_c;
1039     UWORD8 *pu1_mb_d;
1040 
1041     /* number of non zero coeffs*/
1042     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
1043 
1044     /* quantization parameters */
1045     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1046 
1047     /* neighbor availability */
1048     WORD32 i4_ngbr_avbl;
1049     block_neighbors_t s_ngbr_avbl;
1050 
1051     /* temp vars */
1052     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
1053 
1054     /* scan order inside 4x4 block */
1055     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
1056 
1057     /* ngbr sub mb modes */
1058     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
1059     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1060 
1061     /* valid intra modes map */
1062     UWORD32 u4_valid_intra_modes;
1063     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
1064 
1065     /* Dummy variable for 4x4 trans function */
1066     WORD16 i2_dc_dummy;
1067 
1068     /* compute ngbr availability for sub blks */
1069     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3);
1070     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
1071 
1072     for(b8 = 0; b8 < 4; b8++)
1073     {
1074         u4_blk_x = (b8 & 0x01) << 3;
1075         u4_blk_y = (b8 >> 1) << 3;
1076         for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
1077         {
1078             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
1079             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
1080 
1081             pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
1082             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
1083             if (u4_pix_x == 0)
1084             {
1085                 i4_ref_strd_left = ps_proc->i4_rec_strd;
1086                 pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
1087             }
1088             else
1089             {
1090                 i4_ref_strd_left = i4_pred_strd;
1091                 pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
1092             }
1093             if (u4_pix_y == 0)
1094             {
1095                 i4_ref_strd_top = ps_proc->i4_rec_strd;
1096                 pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
1097             }
1098             else
1099             {
1100                 i4_ref_strd_top = i4_pred_strd;
1101                 pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
1102             }
1103 
1104             pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */
1105             pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */
1106             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */
1107             if (u4_pix_y == 0)
1108                 pu1_mb_d = pu1_mb_b - 1;
1109             else
1110                 pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left macro block */
1111 
1112             /* locating neighbors that are available for prediction */
1113             /* TODO : update the neighbor availability information basing on constrained intra pred information */
1114             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
1115             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
1116 
1117             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
1118             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
1119             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
1120             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
1121             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
1122             /* set valid intra modes for evaluation */
1123             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
1124 
1125             /* if top partition is available and top right is not available for intra prediction, then */
1126             /* padd top right samples using top sample and make top right also available */
1127             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
1128 
1129             /* gather prediction pels from the neighbors */
1130             if (s_ngbr_avbl.u1_mb_a)
1131             {
1132                 for(i = 0; i < 4; i++)
1133                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
1134             }
1135             else
1136             {
1137                 memset(pu1_ngbr_pels_i4,0,4);
1138             }
1139             if(s_ngbr_avbl.u1_mb_b)
1140             {
1141                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
1142             }
1143             else
1144             {
1145                 memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
1146             }
1147             if (s_ngbr_avbl.u1_mb_d)
1148                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
1149             else
1150                 pu1_ngbr_pels_i4[4] = 0;
1151             if (s_ngbr_avbl.u1_mb_c)
1152             {
1153                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
1154             }
1155             else if (s_ngbr_avbl.u1_mb_b)
1156             {
1157                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
1158                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
1159             }
1160 
1161             i4_partition_cost_least = INT_MAX;
1162 
1163             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
1164             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
1165             {
1166                 u4_estimated_intra_4x4_mode = DC_I4x4;
1167             }
1168             else
1169             {
1170                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
1171                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
1172 
1173                 if (u4_pix_x == 0)
1174                 {
1175                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
1176                     {
1177                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
1178                     }
1179                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
1180                     {
1181                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
1182                     }
1183                 }
1184                 else
1185                 {
1186                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
1187                 }
1188 
1189                 if (u4_pix_y == 0)
1190                 {
1191                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
1192                     {
1193                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
1194                     }
1195                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
1196                     {
1197                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
1198                     }
1199                 }
1200                 else
1201                 {
1202                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
1203                 }
1204 
1205                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
1206             }
1207 
1208             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
1209 
1210             /*mode evaluation and prediction*/
1211             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
1212                                                          pu1_ngbr_pels_i4,
1213                                                          pu1_pred_mb, i4_src_strd,
1214                                                          i4_pred_strd, i4_ngbr_avbl,
1215                                                          &u4_best_intra_4x4_mode,
1216                                                          &i4_partition_cost_least,
1217                                                          u4_valid_intra_modes,
1218                                                          u4_lambda,
1219                                                          u4_estimated_intra_4x4_mode);
1220 
1221 
1222             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
1223 
1224             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
1225 
1226             /* macroblock distortion */
1227             i4_total_distortion += i4_partition_distortion_least;
1228             i4_total_cost += i4_partition_cost_least;
1229 
1230             /* mb partition mode */
1231             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
1232 
1233 
1234             /********************************************************/
1235             /*  error estimation,                                   */
1236             /*  transform                                           */
1237             /*  quantization                                        */
1238             /********************************************************/
1239             ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
1240                                               pi2_res_mb, i4_src_strd,
1241                                               i4_pred_strd,
1242                                               /* No op stride, this implies a buff of lenght 1x16 */
1243                                               ps_qp_params->pu2_scale_mat,
1244                                               ps_qp_params->pu2_thres_mat,
1245                                               ps_qp_params->u1_qbits,
1246                                               ps_qp_params->u4_dead_zone,
1247                                               pu1_nnz, &i2_dc_dummy);
1248 
1249             /********************************************************/
1250             /*  ierror estimation,                                  */
1251             /*  itransform                                          */
1252             /*  iquantization                                       */
1253             /********************************************************/
1254             ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
1255                                                  pu1_ref_mb_intra_4x4,
1256                                                  i4_pred_strd, i4_pred_strd,
1257                                                  ps_qp_params->pu2_iscale_mat,
1258                                                  ps_qp_params->pu2_weigh_mat,
1259                                                  ps_qp_params->u1_qp_div,
1260                                                  ps_proc->pv_scratch_buff, 0,
1261                                                  NULL);
1262         }
1263     }
1264 
1265     /* update the type of the mb if necessary */
1266     if (i4_total_cost < ps_proc->i4_mb_cost)
1267     {
1268         ps_proc->i4_mb_cost = i4_total_cost;
1269         ps_proc->i4_mb_distortion = i4_total_distortion;
1270         ps_proc->u4_mb_type = I4x4;
1271     }
1272 
1273     return ;
1274 }
1275 
1276 /**
1277 ******************************************************************************
1278 *
1279 * @brief
1280 *  evaluate best chroma intra 8x8 mode (rate distortion opt off)
1281 *
1282 * @par Description
1283 *  This function evaluates all the possible chroma intra 8x8 modes and finds
1284 *  the mode that best represents the macroblock (least distortion) and occupies
1285 *  fewer bits in the bitstream.
1286 *
1287 * @param[in] ps_proc_ctxt
1288 *  pointer to macroblock context (handle)
1289 *
1290 * @remarks
1291 *  For chroma best intra pred mode is calculated based only on SAD
1292 *
1293 * @returns none
1294 *
1295 ******************************************************************************
1296 */
1297 
ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)1298 void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
1299 {
1300     /* Codec Context */
1301     codec_t *ps_codec = ps_proc->ps_codec;
1302 
1303     /* SAD(distortion metric) of an 8x8 block */
1304     WORD32 i4_mb_distortion, i4_chroma_mb_distortion;
1305 
1306     /* intra mode */
1307     UWORD32  u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;
1308 
1309     /* neighbor pels for intra prediction */
1310     UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;
1311 
1312     /* pointer to curr macro block */
1313     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
1314     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
1315 
1316     /* pointer to prediction macro block */
1317     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
1318     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
1319 
1320     /* strides */
1321     WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
1322     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1323     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
1324 
1325     /* neighbors left, top, top left */
1326     UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
1327     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
1328     UWORD8 *pu1_mb_d = pu1_mb_b - 2;
1329 
1330     /* neighbor availability */
1331     const UWORD8  u1_valid_intra_modes[8] = {1, 3, 9, 11, 5, 7, 13, 15,};
1332     WORD32 i4_ngbr_avbl;
1333 
1334     /* valid intra modes map */
1335     UWORD32 u4_valid_intra_modes;
1336 
1337     /* temp var */
1338     UWORD8 i;
1339 
1340     /* locating neighbors that are available for prediction */
1341     /* TODO : update the neighbor availability information basing on constrained intra pred information */
1342     /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines
1343      * basing on neighbors available and hence evade the computation of neighbor availability totally. */
1344     /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
1345     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1);
1346     ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;
1347 
1348     /* gather prediction pels from the neighbors */
1349     /* left pels */
1350     if (ps_proc->ps_ngbr_avbl->u1_mb_a)
1351     {
1352         for (i = 0; i < 16; i += 2)
1353         {
1354             pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
1355             pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
1356         }
1357     }
1358     else
1359     {
1360         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
1361     }
1362 
1363     /* top pels */
1364     if (ps_proc->ps_ngbr_avbl->u1_mb_b)
1365     {
1366         ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
1367     }
1368     else
1369     {
1370         ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
1371     }
1372 
1373     /* top left pels */
1374     if (ps_proc->ps_ngbr_avbl->u1_mb_d)
1375     {
1376         pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
1377         pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
1378     }
1379 
1380     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
1381 
1382     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
1383         u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
1384 
1385     i4_chroma_mb_distortion = INT_MAX;
1386 
1387     /* perform intra mode chroma  8x8 evaluation */
1388     /* intra prediction */
1389     ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
1390                                                     pu1_ngbr_pels_c_i8x8,
1391                                                     pu1_pred_mb,
1392                                                     i4_src_strd_c,
1393                                                     i4_pred_strd,
1394                                                     i4_ngbr_avbl,
1395                                                     &u4_best_chroma_intra_8x8_mode,
1396                                                     &i4_chroma_mb_distortion,
1397                                                     u4_valid_intra_modes);
1398 
1399     if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
1400     {
1401         (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);
1402 
1403         /* evaluate distortion(sad) */
1404         ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);
1405 
1406         /* update the least distortion information if necessary */
1407         if(i4_mb_distortion < i4_chroma_mb_distortion)
1408         {
1409             i4_chroma_mb_distortion = i4_mb_distortion;
1410             u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
1411         }
1412     }
1413 
1414     DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);
1415 
1416     ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;
1417 
1418     return ;
1419 }
1420 
1421 
1422 /**
1423 ******************************************************************************
1424 *
1425 * @brief
1426 *  Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
1427 *  prediction.
1428 *
1429 * @par Description
1430 *  This function evaluates first three 16x16 modes and compute corresponding sad
1431 *  and return the buffer predicted with best mode.
1432 *
1433 * @param[in] pu1_src
1434 *  UWORD8 pointer to the source
1435 *
1436 * @param[in] pu1_ngbr_pels_i16
1437 *  UWORD8 pointer to neighbouring pels
1438 *
1439 * @param[out] pu1_dst
1440 *  UWORD8 pointer to the destination
1441 *
1442 * @param[in] src_strd
1443 *  integer source stride
1444 *
1445 * @param[in] dst_strd
1446 *  integer destination stride
1447 *
1448 * @param[in] u4_n_avblty
1449 *  availability of neighbouring pixels
1450 *
1451 * @param[in] u4_intra_mode
1452 *  Pointer to the variable in which best mode is returned
1453 *
1454 * @param[in] pu4_sadmin
1455 *  Pointer to the variable in which minimum sad is returned
1456 *
1457 * @param[in] u4_valid_intra_modes
1458 *  Says what all modes are valid
1459 *
1460 * @returns      none
1461 *
1462 ******************************************************************************
1463 */
ih264e_evaluate_intra16x16_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels_i16,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)1464 void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
1465                                       UWORD8 *pu1_ngbr_pels_i16,
1466                                       UWORD8 *pu1_dst,
1467                                       UWORD32 src_strd,
1468                                       UWORD32 dst_strd,
1469                                       WORD32 u4_n_avblty,
1470                                       UWORD32 *u4_intra_mode,
1471                                       WORD32 *pu4_sadmin,
1472                                       UWORD32 u4_valid_intra_modes)
1473 {
1474     UWORD8 *pu1_neighbour;
1475     UWORD8 *pu1_src_temp = pu1_src;
1476     UWORD8 left = 0, top = 0;
1477     WORD32 u4_dcval = 0;
1478     WORD32 i, j;
1479     WORD32 i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX, i4_sad_dc = INT_MAX,
1480                     i4_min_sad = INT_MAX;
1481     UWORD8 val;
1482 
1483     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
1484     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
1485 
1486     /* left available */
1487     if (left)
1488     {
1489         i4_sad_horz = 0;
1490 
1491         for (i = 0; i < 16; i++)
1492         {
1493             val = pu1_ngbr_pels_i16[15 - i];
1494 
1495             u4_dcval += val;
1496 
1497             for (j = 0; j < 16; j++)
1498             {
1499                 i4_sad_horz += ABS(val - pu1_src_temp[j]);
1500             }
1501 
1502             pu1_src_temp += src_strd;
1503         }
1504         u4_dcval += 8;
1505     }
1506 
1507     pu1_src_temp = pu1_src;
1508     /* top available */
1509     if (top)
1510     {
1511         i4_sad_vert = 0;
1512 
1513         for (i = 0; i < 16; i++)
1514         {
1515             u4_dcval += pu1_ngbr_pels_i16[17 + i];
1516 
1517             for (j = 0; j < 16; j++)
1518             {
1519                 i4_sad_vert += ABS(pu1_ngbr_pels_i16[17 + j] - pu1_src_temp[j]);
1520             }
1521             pu1_src_temp += src_strd;
1522 
1523         }
1524         u4_dcval += 8;
1525     }
1526 
1527     u4_dcval = (u4_dcval) >> (3 + left + top);
1528 
1529     pu1_src_temp = pu1_src;
1530 
1531     /* none available */
1532     u4_dcval += (left == 0) * (top == 0) * 128;
1533 
1534     i4_sad_dc = 0;
1535 
1536     for (i = 0; i < 16; i++)
1537     {
1538         for (j = 0; j < 16; j++)
1539         {
1540             i4_sad_dc += ABS(u4_dcval - pu1_src_temp[j]);
1541         }
1542         pu1_src_temp += src_strd;
1543     }
1544 
1545     if ((u4_valid_intra_modes & 04) == 0)/* If DC is disabled */
1546         i4_sad_dc = INT_MAX;
1547 
1548     if ((u4_valid_intra_modes & 01) == 0)/* If VERT is disabled */
1549         i4_sad_vert = INT_MAX;
1550 
1551     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled */
1552         i4_sad_horz = INT_MAX;
1553 
1554     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
1555 
1556     /* Finding Minimum sad and doing corresponding prediction */
1557     if (i4_min_sad < *pu4_sadmin)
1558     {
1559         *pu4_sadmin = i4_min_sad;
1560         if (i4_min_sad == i4_sad_vert)
1561         {
1562             *u4_intra_mode = VERT_I16x16;
1563             pu1_neighbour = pu1_ngbr_pels_i16 + 17;
1564             for (j = 0; j < 16; j++)
1565             {
1566                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
1567                 pu1_dst += dst_strd;
1568             }
1569         }
1570         else if (i4_min_sad == i4_sad_horz)
1571         {
1572             *u4_intra_mode = HORZ_I16x16;
1573             for (j = 0; j < 16; j++)
1574             {
1575                 val = pu1_ngbr_pels_i16[15 - j];
1576                 memset(pu1_dst, val, MB_SIZE);
1577                 pu1_dst += dst_strd;
1578             }
1579         }
1580         else
1581         {
1582             *u4_intra_mode = DC_I16x16;
1583             for (j = 0; j < 16; j++)
1584             {
1585                 memset(pu1_dst, u4_dcval, MB_SIZE);
1586                 pu1_dst += dst_strd;
1587             }
1588         }
1589     }
1590     return;
1591 }
1592 
1593 /**
1594 ******************************************************************************
1595 *
1596 * @brief
1597 *  Evaluate best intra 4x4 mode and perform prediction.
1598 *
1599 * @par Description
1600 *  This function evaluates  4x4 modes and compute corresponding sad
1601 *  and return the buffer predicted with best mode.
1602 *
1603 * @param[in] pu1_src
1604 *  UWORD8 pointer to the source
1605 *
1606 * @param[in] pu1_ngbr_pels
1607 *  UWORD8 pointer to neighbouring pels
1608 *
1609 * @param[out] pu1_dst
1610 *  UWORD8 pointer to the destination
1611 *
1612 * @param[in] src_strd
1613 *  integer source stride
1614 *
1615 * @param[in] dst_strd
1616 *  integer destination stride
1617 *
1618 * @param[in] u4_n_avblty
1619 *  availability of neighbouring pixels
1620 *
1621 * @param[in] u4_intra_mode
1622 *  Pointer to the variable in which best mode is returned
1623 *
1624 * @param[in] pu4_sadmin
1625 *  Pointer to the variable in which minimum cost is returned
1626 *
1627 * @param[in] u4_valid_intra_modes
1628 *  Says what all modes are valid
1629 *
1630 * @param[in] u4_lambda
1631 *  Lamda value for computing cost from SAD
1632 *
1633 * @param[in] u4_predictd_mode
1634 *  Predicted mode for cost computation
1635 *
1636 * @returns      none
1637 *
1638 ******************************************************************************
1639 */
ih264e_evaluate_intra_4x4_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes,UWORD32 u4_lambda,UWORD32 u4_predictd_mode)1640 void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
1641                                      UWORD8 *pu1_ngbr_pels,
1642                                      UWORD8 *pu1_dst,
1643                                      UWORD32 src_strd,
1644                                      UWORD32 dst_strd,
1645                                      WORD32 u4_n_avblty,
1646                                      UWORD32 *u4_intra_mode,
1647                                      WORD32 *pu4_sadmin,
1648                                      UWORD32 u4_valid_intra_modes,
1649                                      UWORD32  u4_lambda,
1650                                      UWORD32 u4_predictd_mode)
1651 {
1652     UWORD8 *pu1_src_temp = pu1_src;
1653     UWORD8 *pu1_pred = pu1_ngbr_pels;
1654     UWORD8 left = 0, top = 0;
1655     UWORD8 u1_pred_val = 0;
1656     UWORD8 u1_pred_vals[4] = {0};
1657     UWORD8 *pu1_pred_val = NULL;
1658     /* To store FILT121 operated values*/
1659     UWORD8 u1_pred_vals_diag_121[15] = {0};
1660     /* To store FILT11 operated values*/
1661     UWORD8 u1_pred_vals_diag_11[15] = {0};
1662     UWORD8 u1_pred_vals_vert_r[8] = {0};
1663     UWORD8 u1_pred_vals_horz_d[10] = {0};
1664     UWORD8 u1_pred_vals_horz_u[10] = {0};
1665     WORD32 u4_dcval = 0;
1666     WORD32 i4_sad[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
1667                                INT_MAX, INT_MAX, INT_MAX, INT_MAX};
1668 
1669     WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
1670                                 INT_MAX, INT_MAX, INT_MAX, INT_MAX};
1671     WORD32 i, i4_min_cost = INT_MAX;
1672 
1673     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
1674     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
1675 
1676     /* Computing SAD */
1677 
1678     /* VERT mode valid */
1679     if (u4_valid_intra_modes & 1)
1680     {
1681         pu1_pred = pu1_ngbr_pels + 5;
1682         i4_sad[VERT_I4x4] = 0;
1683         i4_cost[VERT_I4x4] = 0;
1684 
1685         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1686         pu1_src_temp += src_strd;
1687         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1688         pu1_src_temp += src_strd;
1689         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1690         pu1_src_temp += src_strd;
1691         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
1692 
1693         i4_cost[VERT_I4x4] = i4_sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ?
1694                                         u4_lambda : 4 * u4_lambda);
1695     }
1696 
1697     /* HORZ mode valid */
1698     if (u4_valid_intra_modes & 2)
1699     {
1700         i4_sad[HORZ_I4x4] = 0;
1701         i4_cost[HORZ_I4x4] =0;
1702         pu1_src_temp = pu1_src;
1703 
1704         u1_pred_val = pu1_ngbr_pels[3];
1705 
1706         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1707                         + ABS(pu1_src_temp[1] - u1_pred_val)
1708                         + ABS(pu1_src_temp[2] - u1_pred_val)
1709                         + ABS(pu1_src_temp[3] - u1_pred_val);
1710         pu1_src_temp += src_strd;
1711 
1712         u1_pred_val = pu1_ngbr_pels[2];
1713 
1714         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1715                         + ABS(pu1_src_temp[1] - u1_pred_val)
1716                         + ABS(pu1_src_temp[2] - u1_pred_val)
1717                         + ABS(pu1_src_temp[3] - u1_pred_val);
1718         pu1_src_temp += src_strd;
1719 
1720         u1_pred_val = pu1_ngbr_pels[1];
1721 
1722         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1723                         + ABS(pu1_src_temp[1] - u1_pred_val)
1724                         + ABS(pu1_src_temp[2] - u1_pred_val)
1725                         + ABS(pu1_src_temp[3] - u1_pred_val);
1726         pu1_src_temp += src_strd;
1727 
1728         u1_pred_val = pu1_ngbr_pels[0];
1729 
1730         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
1731                         + ABS(pu1_src_temp[1] - u1_pred_val)
1732                         + ABS(pu1_src_temp[2] - u1_pred_val)
1733                         + ABS(pu1_src_temp[3] - u1_pred_val);
1734 
1735         i4_cost[HORZ_I4x4] = i4_sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ?
1736                                         u4_lambda : 4 * u4_lambda);
1737     }
1738 
1739     /* DC mode valid */
1740     if (u4_valid_intra_modes & 4)
1741     {
1742         i4_sad[DC_I4x4] = 0;
1743         i4_cost[DC_I4x4] = 0;
1744         pu1_src_temp = pu1_src;
1745 
1746         if (left)
1747             u4_dcval = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2]
1748                             + pu1_ngbr_pels[3] + 2;
1749         if (top)
1750             u4_dcval += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7]
1751                             + pu1_ngbr_pels[8] + 2;
1752 
1753         u4_dcval = (u4_dcval) ? (u4_dcval >> (1 + left + top)) : 128;
1754 
1755         /* none available */
1756         memset(u1_pred_vals, u4_dcval, 4);
1757         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1758         pu1_src_temp += src_strd;
1759         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1760         pu1_src_temp += src_strd;
1761         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1762         pu1_src_temp += src_strd;
1763         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
1764         pu1_src_temp += src_strd;
1765 
1766         i4_cost[DC_I4x4] = i4_sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ?
1767                                         u4_lambda : 4 * u4_lambda);
1768     }
1769 
1770     /* if modes other than VERT, HORZ and DC are  valid */
1771     if (u4_valid_intra_modes > 7)
1772     {
1773         pu1_pred = pu1_ngbr_pels;
1774         pu1_pred[13] = pu1_pred[14] = pu1_pred[12];
1775 
1776         /* Performing FILT121 and FILT11 operation for all neighbour values*/
1777         for (i = 0; i < 13; i++)
1778         {
1779             u1_pred_vals_diag_121[i] = FILT121(pu1_pred[0], pu1_pred[1], pu1_pred[2]);
1780             u1_pred_vals_diag_11[i] = FILT11(pu1_pred[0], pu1_pred[1]);
1781 
1782             pu1_pred++;
1783         }
1784 
1785         if (u4_valid_intra_modes & 8)/* DIAG_DL */
1786         {
1787             i4_sad[DIAG_DL_I4x4] = 0;
1788             i4_cost[DIAG_DL_I4x4] = 0;
1789             pu1_src_temp = pu1_src;
1790             pu1_pred_val = u1_pred_vals_diag_121 + 5;
1791 
1792             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DL_I4x4]);
1793             pu1_src_temp += src_strd;
1794             USADA8(pu1_src_temp, (pu1_pred_val + 1), i4_sad[DIAG_DL_I4x4]);
1795             pu1_src_temp += src_strd;
1796             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[DIAG_DL_I4x4]);
1797             pu1_src_temp += src_strd;
1798             USADA8(pu1_src_temp, (pu1_pred_val + 3), i4_sad[DIAG_DL_I4x4]);
1799             pu1_src_temp += src_strd;
1800             i4_cost[DIAG_DL_I4x4] = i4_sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ?
1801                                             u4_lambda : 4 * u4_lambda);
1802         }
1803 
1804         if (u4_valid_intra_modes & 16)/* DIAG_DR */
1805         {
1806             i4_sad[DIAG_DR_I4x4] = 0;
1807             i4_cost[DIAG_DR_I4x4] = 0;
1808             pu1_src_temp = pu1_src;
1809             pu1_pred_val = u1_pred_vals_diag_121 + 3;
1810 
1811             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DR_I4x4]);
1812             pu1_src_temp += src_strd;
1813             USADA8(pu1_src_temp, (pu1_pred_val - 1), i4_sad[DIAG_DR_I4x4]);
1814             pu1_src_temp += src_strd;
1815             USADA8(pu1_src_temp, (pu1_pred_val - 2), i4_sad[DIAG_DR_I4x4]);
1816             pu1_src_temp += src_strd;
1817             USADA8(pu1_src_temp, (pu1_pred_val - 3), i4_sad[DIAG_DR_I4x4]);
1818             pu1_src_temp += src_strd;
1819             i4_cost[DIAG_DR_I4x4] = i4_sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ?
1820                                             u4_lambda : 4 * u4_lambda);
1821 
1822         }
1823 
1824         if (u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/
1825         {
1826             i4_sad[VERT_R_I4x4] = 0;
1827 
1828             pu1_src_temp = pu1_src;
1829             u1_pred_vals_vert_r[0] = u1_pred_vals_diag_121[2];
1830             memcpy((u1_pred_vals_vert_r + 1), (u1_pred_vals_diag_11 + 4), 3);
1831             u1_pred_vals_vert_r[4] = u1_pred_vals_diag_121[1];
1832             memcpy((u1_pred_vals_vert_r + 5), (u1_pred_vals_diag_121 + 3), 3);
1833 
1834             pu1_pred_val = u1_pred_vals_diag_11 + 4;
1835             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
1836             pu1_pred_val = u1_pred_vals_diag_121 + 3;
1837             pu1_src_temp += src_strd;
1838             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
1839             pu1_src_temp += src_strd;
1840             USADA8(pu1_src_temp, (u1_pred_vals_vert_r), i4_sad[VERT_R_I4x4]);
1841             pu1_src_temp += src_strd;
1842             USADA8(pu1_src_temp, (u1_pred_vals_vert_r + 4),
1843                    i4_sad[VERT_R_I4x4]);
1844 
1845             i4_cost[VERT_R_I4x4] = i4_sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ?
1846                                             u4_lambda : 4 * u4_lambda);
1847         }
1848 
1849         if (u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/
1850         {
1851             i4_sad[HORZ_D_I4x4] = 0;
1852 
1853             pu1_src_temp = pu1_src;
1854             u1_pred_vals_horz_d[6] = u1_pred_vals_diag_11[3];
1855             memcpy((u1_pred_vals_horz_d + 7), (u1_pred_vals_diag_121 + 3), 3);
1856             u1_pred_vals_horz_d[0] = u1_pred_vals_diag_11[0];
1857             u1_pred_vals_horz_d[1] = u1_pred_vals_diag_121[0];
1858             u1_pred_vals_horz_d[2] = u1_pred_vals_diag_11[1];
1859             u1_pred_vals_horz_d[3] = u1_pred_vals_diag_121[1];
1860             u1_pred_vals_horz_d[4] = u1_pred_vals_diag_11[2];
1861             u1_pred_vals_horz_d[5] = u1_pred_vals_diag_121[2];
1862 
1863             pu1_pred_val = u1_pred_vals_horz_d;
1864             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_D_I4x4]);
1865             pu1_src_temp += src_strd;
1866             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_D_I4x4]);
1867             pu1_src_temp += src_strd;
1868             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_D_I4x4]);
1869             pu1_src_temp += src_strd;
1870             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_D_I4x4]);
1871 
1872             i4_cost[HORZ_D_I4x4] = i4_sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ?
1873                                             u4_lambda : 4 * u4_lambda);
1874         }
1875 
1876         if (u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/
1877         {
1878             i4_sad[VERT_L_I4x4] = 0;
1879             pu1_src_temp = pu1_src;
1880             pu1_pred_val = u1_pred_vals_diag_11 + 5;
1881             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1882             pu1_src_temp += src_strd;
1883             pu1_pred_val = u1_pred_vals_diag_121 + 5;
1884             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1885             pu1_src_temp += src_strd;
1886             pu1_pred_val = u1_pred_vals_diag_11 + 6;
1887             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1888             pu1_src_temp += src_strd;
1889             pu1_pred_val = u1_pred_vals_diag_121 + 6;
1890             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
1891 
1892             i4_cost[VERT_L_I4x4] = i4_sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ?
1893                                             u4_lambda : 4 * u4_lambda);
1894         }
1895 
1896         if (u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/
1897         {
1898             i4_sad[HORZ_U_I4x4] = 0;
1899             pu1_src_temp = pu1_src;
1900             u1_pred_vals_horz_u[0] = u1_pred_vals_diag_11[2];
1901             u1_pred_vals_horz_u[1] = u1_pred_vals_diag_121[1];
1902             u1_pred_vals_horz_u[2] = u1_pred_vals_diag_11[1];
1903             u1_pred_vals_horz_u[3] = u1_pred_vals_diag_121[0];
1904             u1_pred_vals_horz_u[4] = u1_pred_vals_diag_11[0];
1905             u1_pred_vals_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]);
1906 
1907             memset((u1_pred_vals_horz_u + 6), pu1_ngbr_pels[0], 4);
1908 
1909             pu1_pred_val = u1_pred_vals_horz_u;
1910             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_U_I4x4]);
1911             pu1_src_temp += src_strd;
1912             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_U_I4x4]);
1913             pu1_src_temp += src_strd;
1914             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_U_I4x4]);
1915             pu1_src_temp += src_strd;
1916             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_U_I4x4]);
1917 
1918             i4_cost[HORZ_U_I4x4] = i4_sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ?
1919                                             u4_lambda : 4 * u4_lambda);
1920         }
1921 
1922         i4_min_cost = MIN3(MIN3(i4_cost[0], i4_cost[1], i4_cost[2]),
1923                         MIN3(i4_cost[3], i4_cost[4], i4_cost[5]),
1924                         MIN3(i4_cost[6], i4_cost[7], i4_cost[8]));
1925 
1926     }
1927     else
1928     {
1929         /* Only first three modes valid */
1930         i4_min_cost = MIN3(i4_cost[0], i4_cost[1], i4_cost[2]);
1931     }
1932 
1933     *pu4_sadmin = i4_min_cost;
1934 
1935     if (i4_min_cost == i4_cost[0])
1936     {
1937         *u4_intra_mode = VERT_I4x4;
1938         pu1_pred_val = pu1_ngbr_pels + 5;
1939         memcpy(pu1_dst, (pu1_pred_val), 4);
1940         pu1_dst += dst_strd;
1941         memcpy(pu1_dst, (pu1_pred_val), 4);
1942         pu1_dst += dst_strd;
1943         memcpy(pu1_dst, (pu1_pred_val), 4);
1944         pu1_dst += dst_strd;
1945         memcpy(pu1_dst, (pu1_pred_val), 4);
1946     }
1947     else if (i4_min_cost == i4_cost[1])
1948     {
1949         *u4_intra_mode = HORZ_I4x4;
1950         memset(pu1_dst, pu1_ngbr_pels[3], 4);
1951         pu1_dst += dst_strd;
1952         memset(pu1_dst, pu1_ngbr_pels[2], 4);
1953         pu1_dst += dst_strd;
1954         memset(pu1_dst, pu1_ngbr_pels[1], 4);
1955         pu1_dst += dst_strd;
1956         memset(pu1_dst, pu1_ngbr_pels[0], 4);
1957     }
1958     else if (i4_min_cost == i4_cost[2])
1959     {
1960         *u4_intra_mode = DC_I4x4;
1961         memset(pu1_dst, u4_dcval, 4);
1962         pu1_dst += dst_strd;
1963         memset(pu1_dst, u4_dcval, 4);
1964         pu1_dst += dst_strd;
1965         memset(pu1_dst, u4_dcval, 4);
1966         pu1_dst += dst_strd;
1967         memset(pu1_dst, u4_dcval, 4);
1968     }
1969 
1970     else if (i4_min_cost == i4_cost[3])
1971     {
1972         *u4_intra_mode = DIAG_DL_I4x4;
1973         pu1_pred_val = u1_pred_vals_diag_121 + 5;
1974         memcpy(pu1_dst, (pu1_pred_val), 4);
1975         pu1_dst += dst_strd;
1976         memcpy(pu1_dst, (pu1_pred_val + 1), 4);
1977         pu1_dst += dst_strd;
1978         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
1979         pu1_dst += dst_strd;
1980         memcpy(pu1_dst, (pu1_pred_val + 3), 4);
1981     }
1982     else if (i4_min_cost == i4_cost[4])
1983     {
1984         *u4_intra_mode = DIAG_DR_I4x4;
1985         pu1_pred_val = u1_pred_vals_diag_121 + 3;
1986 
1987         memcpy(pu1_dst, (pu1_pred_val), 4);
1988         pu1_dst += dst_strd;
1989         memcpy(pu1_dst, (pu1_pred_val - 1), 4);
1990         pu1_dst += dst_strd;
1991         memcpy(pu1_dst, (pu1_pred_val - 2), 4);
1992         pu1_dst += dst_strd;
1993         memcpy(pu1_dst, (pu1_pred_val - 3), 4);
1994     }
1995 
1996     else if (i4_min_cost == i4_cost[5])
1997     {
1998         *u4_intra_mode = VERT_R_I4x4;
1999         pu1_pred_val = u1_pred_vals_diag_11 + 4;
2000         memcpy(pu1_dst, (pu1_pred_val), 4);
2001         pu1_dst += dst_strd;
2002         pu1_pred_val = u1_pred_vals_diag_121 + 3;
2003         memcpy(pu1_dst, (pu1_pred_val), 4);
2004         pu1_dst += dst_strd;
2005         memcpy(pu1_dst, (u1_pred_vals_vert_r), 4);
2006         pu1_dst += dst_strd;
2007         memcpy(pu1_dst, (u1_pred_vals_vert_r + 4), 4);
2008     }
2009     else if (i4_min_cost == i4_cost[6])
2010     {
2011         *u4_intra_mode = HORZ_D_I4x4;
2012         pu1_pred_val = u1_pred_vals_horz_d;
2013         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
2014         pu1_dst += dst_strd;
2015         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
2016         pu1_dst += dst_strd;
2017         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
2018         pu1_dst += dst_strd;
2019         memcpy(pu1_dst, (pu1_pred_val), 4);
2020         pu1_dst += dst_strd;
2021     }
2022     else if (i4_min_cost == i4_cost[7])
2023     {
2024         *u4_intra_mode = VERT_L_I4x4;
2025         pu1_pred_val = u1_pred_vals_diag_11 + 5;
2026         memcpy(pu1_dst, (pu1_pred_val), 4);
2027         pu1_dst += dst_strd;
2028         pu1_pred_val = u1_pred_vals_diag_121 + 5;
2029         memcpy(pu1_dst, (pu1_pred_val), 4);
2030         pu1_dst += dst_strd;
2031         pu1_pred_val = u1_pred_vals_diag_11 + 6;
2032         memcpy(pu1_dst, (pu1_pred_val), 4);
2033         pu1_dst += dst_strd;
2034         pu1_pred_val = u1_pred_vals_diag_121 + 6;
2035         memcpy(pu1_dst, (pu1_pred_val), 4);
2036     }
2037     else if (i4_min_cost == i4_cost[8])
2038     {
2039         *u4_intra_mode = HORZ_U_I4x4;
2040         pu1_pred_val = u1_pred_vals_horz_u;
2041         memcpy(pu1_dst, (pu1_pred_val), 4);
2042         pu1_dst += dst_strd;
2043         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
2044         pu1_dst += dst_strd;
2045         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
2046         pu1_dst += dst_strd;
2047         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
2048         pu1_dst += dst_strd;
2049     }
2050 
2051     return;
2052 }
2053 
2054 /**
2055 ******************************************************************************
2056 *
2057 * @brief:
2058 *  Evaluate best intr chroma mode (among VERT, HORZ and DC ) and do the prediction.
2059 *
2060 * @par Description
2061 *  This function evaluates  first three intra chroma modes and compute corresponding sad
2062 *  and return the buffer predicted with best mode.
2063 *
2064 * @param[in] pu1_src
2065 *  UWORD8 pointer to the source
2066 *
2067 * @param[in] pu1_ngbr_pels
2068 *  UWORD8 pointer to neighbouring pels
2069 *
2070 * @param[out] pu1_dst
2071 *  UWORD8 pointer to the destination
2072 *
2073 * @param[in] src_strd
2074 *  integer source stride
2075 *
2076 * @param[in] dst_strd
2077 *  integer destination stride
2078 *
2079 * @param[in] u4_n_avblty
2080 *  availability of neighbouring pixels
2081 *
2082 * @param[in] u4_intra_mode
2083 *  Pointer to the variable in which best mode is returned
2084 *
2085 * @param[in] pu4_sadmin
2086 *  Pointer to the variable in which minimum sad is returned
2087 *
2088 * @param[in] u4_valid_intra_modes
2089 *  Says what all modes are valid
2090 *
2091 * @return      none
2092 *
2093 ******************************************************************************
2094 */
ih264e_evaluate_intra_chroma_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)2095 void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
2096                                         UWORD8 *pu1_ngbr_pels,
2097                                         UWORD8 *pu1_dst,
2098                                         UWORD32 src_strd,
2099                                         UWORD32 dst_strd,
2100                                         WORD32 u4_n_avblty,
2101                                         UWORD32 *u4_intra_mode,
2102                                         WORD32 *pu4_sadmin,
2103                                         UWORD32 u4_valid_intra_modes)
2104 {
2105     UWORD8 *pu1_neighbour;
2106     UWORD8 *pu1_src_temp = pu1_src;
2107     UWORD8 left = 0, top = 0;
2108     WORD32 u4_dcval_u_l[2] = { 0, 0 }, /*sum left neighbours for 'U' ,two separate sets - sum of first four from top,and sum of four values from bottom */
2109            u4_dcval_u_t[2] = { 0, 0 };  /*sum top neighbours for 'U'*/
2110 
2111     WORD32 u4_dcval_v_l[2] = { 0, 0 }, /*sum left neighbours for 'V'*/
2112            u4_dcval_v_t[2] = { 0, 0 }; /*sum top neighbours for 'V'*/
2113 
2114     WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX,
2115                     i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX;
2116     UWORD8 val_u, val_v;
2117 
2118     WORD32 u4_dc_val[2][2][2];/*  -----------
2119                                   |    |    |  Chroma can have four
2120                                   | 00 | 01 |  separate dc value...
2121                                   -----------  u4_dc_val corresponds to this dc values
2122                                   |    |    |  with u4_dc_val[2][2][U] and u4_dc_val[2][2][V]
2123                                   | 10 | 11 |
2124                                   -----------                */
2125     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
2126     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
2127 
2128     /*Evaluating HORZ*/
2129     if (left)/* Ifleft available*/
2130     {
2131         i4_sad_horz = 0;
2132 
2133         for (i = 0; i < 8; i++)
2134         {
2135             val_v = pu1_ngbr_pels[15 - 2 * i];
2136             val_u = pu1_ngbr_pels[15 - 2 * i - 1];
2137             row = i / 4;
2138             u4_dcval_u_l[row] += val_u;
2139             u4_dcval_v_l[row] += val_v;
2140             for (j = 0; j < 8; j++)
2141             {
2142                 i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/
2143                 i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]);
2144             }
2145 
2146             pu1_src_temp += src_strd;
2147         }
2148         u4_dcval_u_l[0] += 2;
2149         u4_dcval_u_l[1] += 2;
2150         u4_dcval_v_l[0] += 2;
2151         u4_dcval_v_l[1] += 2;
2152     }
2153 
2154     /*Evaluating VERT**/
2155     pu1_src_temp = pu1_src;
2156     if (top) /* top available*/
2157     {
2158         i4_sad_vert = 0;
2159 
2160         for (i = 0; i < 8; i++)
2161         {
2162             col = i / 4;
2163 
2164             val_u = pu1_ngbr_pels[18 + i * 2];
2165             val_v = pu1_ngbr_pels[18 + i * 2 + 1];
2166             u4_dcval_u_t[col] += val_u;
2167             u4_dcval_v_t[col] += val_v;
2168 
2169             for (j = 0; j < 16; j++)
2170             {
2171                 i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/
2172             }
2173             pu1_src_temp += src_strd;
2174 
2175         }
2176         u4_dcval_u_t[0] += 2;
2177         u4_dcval_u_t[1] += 2;
2178         u4_dcval_v_t[0] += 2;
2179         u4_dcval_v_t[1] += 2;
2180     }
2181 
2182     /* computing DC value*/
2183     /* Equation  8-128 in spec*/
2184     u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top);
2185     u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top);
2186     u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top);
2187     u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top);
2188 
2189     if (top)
2190     {
2191         /* Equation  8-132 in spec*/
2192         u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top);
2193         u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top);
2194     }
2195     else
2196     {
2197         u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left);
2198         u4_dc_val[0][1][1] = (u4_dcval_v_l[0]) >> (1 + left);
2199     }
2200 
2201     if (left)
2202     {
2203         u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left);
2204         u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left);
2205     }
2206     else
2207     {
2208         u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top);
2209         u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top);
2210     }
2211 
2212     if (!(left || top))
2213     {
2214         /*none available*/
2215         u4_dc_val[0][0][0] = u4_dc_val[0][0][1] =
2216         u4_dc_val[0][1][0] = u4_dc_val[0][1][1] =
2217         u4_dc_val[1][0][0] = u4_dc_val[1][0][1] =
2218         u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128;
2219     }
2220 
2221     /* Evaluating DC */
2222     pu1_src_temp = pu1_src;
2223     i4_sad_dc = 0;
2224     for (i = 0; i < 8; i++)
2225     {
2226         for (j = 0; j < 8; j++)
2227         {
2228             col = j / 4;
2229             row = i / 4;
2230             val_u = u4_dc_val[row][col][0];
2231             val_v = u4_dc_val[row][col][1];
2232 
2233             i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/
2234             i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]);
2235         }
2236         pu1_src_temp += src_strd;
2237     }
2238 
2239     if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/
2240         i4_sad_dc = INT_MAX;
2241     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/
2242         i4_sad_horz = INT_MAX;
2243     if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/
2244         i4_sad_vert = INT_MAX;
2245 
2246     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
2247 
2248     /* Finding Minimum sad and doing corresponding prediction*/
2249     if (i4_min_sad < *pu4_sadmin)
2250     {
2251         *pu4_sadmin = i4_min_sad;
2252 
2253         if (i4_min_sad == i4_sad_dc)
2254         {
2255             *u4_intra_mode = DC_CH_I8x8;
2256             for (i = 0; i < 8; i++)
2257             {
2258                 for (j = 0; j < 8; j++)
2259                 {
2260                     col = j / 4;
2261                     row = i / 4;
2262 
2263                     pu1_dst[2 * j] = u4_dc_val[row][col][0];
2264                     pu1_dst[2 * j + 1] = u4_dc_val[row][col][1];
2265                 }
2266                 pu1_dst += dst_strd;
2267             }
2268         }
2269         else if (i4_min_sad == i4_sad_horz)
2270         {
2271             *u4_intra_mode = HORZ_CH_I8x8;
2272             for (j = 0; j < 8; j++)
2273             {
2274                 val_v = pu1_ngbr_pels[15 - 2 * j];
2275                 val_u = pu1_ngbr_pels[15 - 2 * j - 1];
2276 
2277                 for (i = 0; i < 8; i++)
2278                 {
2279                     pu1_dst[2 * i] = val_u;
2280                     pu1_dst[2 * i + 1] = val_v;
2281 
2282                 }
2283                 pu1_dst += dst_strd;
2284             }
2285         }
2286         else
2287         {
2288             *u4_intra_mode = VERT_CH_I8x8;
2289             pu1_neighbour = pu1_ngbr_pels + 18;
2290             for (j = 0; j < 8; j++)
2291             {
2292                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
2293                 pu1_dst += dst_strd;
2294             }
2295         }
2296     }
2297 
2298     return;
2299 }
2300