/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ih264e_intra_modes_eval_ssse3.c
*
* @brief
*  This file contains definitions of routines that perform rate-distortion
*  analysis on macroblocks that are to be coded as intra.
*
* @author
*  Ittiam
*
* @par List of Functions:
*  ih264e_evaluate_intra16x16_modes_ssse3
*  ih264e_evaluate_intra_4x4_modes_ssse3
*  ih264e_evaluate_intra_chroma_modes_ssse3
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <immintrin.h>

/* User include files */
#include "ih264e_config.h"
#include "ih264_typedefs.h"
#include "ih264e_defs.h"
#include "iv2.h"
#include "ive2.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_macros.h"
#include "ih264_intra_pred_filters.h"
#include "ih264_structs.h"
#include "ih264_common_tables.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_mem_fns.h"
#include "ih264_padding.h"
#include "ih264_deblk_edge_filters.h"
#include "ime_distortion_metrics.h"
#include "ih264e_error.h"
#include "ih264e_bitstream.h"
#include "ime_defs.h"
#include "ime_structs.h"
#include "ih264_cabac_tables.h"
#include "irc_cntrl_param.h"
#include "irc_frame_info_collector.h"
#include "ih264e_rate_control.h"

#include "ih264e_cabac_structs.h"
#include "ih264e_structs.h"
#include "ih264e_cabac.h"
#include "ih264e_intra_modes_eval.h"
#include "ih264e_globals.h"
#include "ime_platform_macros.h"


88 
89 /*****************************************************************************/
90 /* Function Definitions                                                      */
91 /*****************************************************************************/
92 /**
93 ******************************************************************************
94 *
95 * @brief
96 *  evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
97 *  prediction.
98 *
99 * @par Description
100 *  This function evaluates first three 16x16 modes and compute corresponding
101 *  SAD and returns the buffer predicted with best mode.
102 *
103 * @param[in] pu1_src
104 *  UWORD8 pointer to the source
105 *
106 * @param[in] pu1_ngbr_pels_i16
107 *  UWORD8 pointer to neighbouring pels
108 *
109 * @param[out] pu1_dst
110 *  UWORD8 pointer to the destination
111 *
112 * @param[in] src_strd
113 *  integer source stride
114 *
115 * @param[in] dst_strd
116 *  integer destination stride
117 *
118 * @param[in] u4_n_avblty
119 *  availability of neighbouring pixels
120 *
121 * @param[in] u4_intra_mode
122 *  pointer to the variable in which best mode is returned
123 *
124 * @param[in] pu4_sadmin
125 *  pointer to the variable in which minimum sad is returned
126 *
127 * @param[in] u4_valid_intra_modes
128 *  says what all modes are valid
129 *
130 * @return
131 *  None
132 *
133 ******************************************************************************
134 */
void ih264e_evaluate_intra16x16_modes_ssse3(UWORD8 *pu1_src,
                                            UWORD8 *pu1_ngbr_pels_i16,
                                            UWORD8 *pu1_dst,
                                            UWORD32 src_strd,
                                            UWORD32 dst_strd,
                                            WORD32 n_avblty,
                                            UWORD32 *u4_intra_mode,
                                            WORD32 *pu4_sadmin,
                                            UWORD32 u4_valid_intra_modes)
{
    UWORD8 *pu1_src_temp;

    WORD32 left, top, horz_flag, vert_flag, dc_flag;
    WORD32 sad_vert, sad_horz, sad_dc, min_sad;

    WORD32 cnt, dcval;
    WORD32 src_strd2, src_strd3, src_strd4;
    WORD32 dst_strd2, dst_strd3, dst_strd4;

    __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
    __m128i val1_16x8b, val2_16x8b, val3_16x8b, val4_16x8b;
    __m128i sad1_8x16b, sad2_8x16b, sad3_8x16b, sad4_8x16b;

    __m128i sad_8x16b, val_16x8b, zero_vector;

    sad_vert = INT_MAX;
    sad_horz = INT_MAX;
    sad_dc = INT_MAX;

    src_strd2 = src_strd << 1;
    src_strd4 = src_strd << 2;
    src_strd3 = src_strd + src_strd2;

    dst_strd2 = dst_strd << 1;
    dst_strd4 = dst_strd << 2;
    dst_strd3 = dst_strd + dst_strd2;

    left = (n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    zero_vector = _mm_setzero_si128();

    horz_flag = left && ((u4_valid_intra_modes & 02) != 0);
    vert_flag = top && ((u4_valid_intra_modes & 01) != 0);
    dc_flag = (u4_valid_intra_modes & 04) != 0;
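
    /*
     * u4_valid_intra_modes flags for 16x16: bit 0 => VERT, bit 1 => HORZ,
     * bit 2 => DC (the 01/02/04 literals above are octal, i.e. 1, 2 and 4).
     * HORZ additionally requires the left neighbours and VERT the top ones.
     */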

    if(horz_flag)
    {
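        /*
         * HORZ prediction: each row is its left neighbour broadcast across
         * 16 pixels. pu1_ngbr_pels_i16[0..15] hold the left column stored
         * bottom-to-top, so row 0 uses index 15. Four rows are processed
         * per iteration; _mm_sad_epu8 leaves two 16-bit partial sums per
         * register (lanes 0 and 4), which are packed and accumulated.
         */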
        pu1_src_temp = pu1_src;

        val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[15]);
        val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[14]);
        val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[13]);
        val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[12]);

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 11;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
            val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
            val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
            val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt >= 0);

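        /* Three horizontal adds collapse the eight 16-bit partial sums
           into lane 0, which then holds the total SAD for the mode. */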
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_horz = _mm_extract_epi16(sad_8x16b, 0);
    }

    if(vert_flag)
    {
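        /* VERT prediction: the 16 top neighbours (at offset 17, past the
           16 left pixels and the top-left corner) predict every row, so a
           single reference vector is compared against four source rows
           per iteration. */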
        pu1_src_temp = pu1_src;

        val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 11;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt >= 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_vert = _mm_extract_epi16(sad_8x16b, 0);
    }

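    /*
     * DC value: sum the available neighbours with rounding and divide by
     * the number of pixels summed. Each _mm_sad_epu8 against zero sums 16
     * bytes into lanes 0 and 4; the +8 added per available edge provides
     * rounding, and the shift is 4 (one edge) or 5 (both edges). With no
     * neighbours available the predictor defaults to 128.
     */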
    dcval = 0;

    if(left)
    {
        val_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels_i16);
        dcval += 8;

        sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
        dcval += _mm_extract_epi16(sad1_8x16b, 0);
        dcval += _mm_extract_epi16(sad1_8x16b, 4);
    }
    if(top)
    {
        val_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
        dcval += 8;

        sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
        dcval += _mm_extract_epi16(sad1_8x16b, 0);
        dcval += _mm_extract_epi16(sad1_8x16b, 4);
    }
    dcval = dcval >> (3 + left + top);
    dcval += ((left == 0) & (top == 0)) << 7;

    if(dc_flag)
    {
        pu1_src_temp = pu1_src;
        val1_16x8b = _mm_set1_epi8(dcval);

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 12;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt > 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_dc = _mm_extract_epi16(sad_8x16b, 0);
    }

    // Doing prediction for minimum SAD
    min_sad = MIN3(sad_horz, sad_vert, sad_dc);
    if(min_sad < *pu4_sadmin)
    {
        *pu4_sadmin = min_sad;
        if(min_sad == sad_vert)
        {
            *u4_intra_mode = VERT_I16x16;
            val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
            cnt = 15;
            do
            {
                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt > 0);
        }
        else if(min_sad == sad_horz)
        {
            *u4_intra_mode = HORZ_I16x16;
            cnt = 15;
            do
            {
                val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
                val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
                val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
                val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);

                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val2_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val3_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val4_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt >= 0);
        }
        else
        {
            *u4_intra_mode = DC_I16x16;
            val1_16x8b = _mm_set1_epi8(dcval);
            cnt = 15;
            do
            {
                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt > 0);
        }
    }
}

/**
******************************************************************************
*
* @brief
*  Evaluate the best intra 4x4 mode and do the prediction.
*
* @par Description
*  This function evaluates the intra 4x4 modes, computes the corresponding
*  costs and returns the buffer predicted with the best mode.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_ngbr_pels
*  UWORD8 pointer to neighbouring pels
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] u4_n_avblty
*  availability of neighbouring pixels
*
* @param[out] u4_intra_mode
*  pointer to the variable in which the best mode is returned
*
* @param[out] pu4_sadmin
*  pointer to the variable in which the minimum cost is returned
*
* @param[in] u4_valid_intra_modes
*  indicates which modes are valid
*
* @param[in] u4_lambda
*  lambda value for computing cost from SAD
*
* @param[in] u4_predictd_mode
*  predicted mode for cost computation
*
* @return
*  None
*
******************************************************************************
*/
void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_ngbr_pels,
                                           UWORD8 *pu1_dst,
                                           UWORD32 src_strd,
                                           UWORD32 dst_strd,
                                           WORD32 u4_n_avblty,
                                           UWORD32 *u4_intra_mode,
                                           WORD32 *pu4_sadmin,
                                           UWORD32 u4_valid_intra_modes,
                                           UWORD32 u4_lambda,
                                           UWORD32 u4_predictd_mode)
{
    WORD32 left, top;
    WORD32 sad[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
                             INT_MAX, INT_MAX, INT_MAX, INT_MAX };
    WORD32 cost[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
                              INT_MAX, INT_MAX, INT_MAX, INT_MAX };

    WORD32 min_cost;
    UWORD32 lambda4 = u4_lambda << 2;
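    /* Mode cost = SAD + lambda penalty: 4 * lambda for a mode, reduced to
       lambda when the mode equals the predicted (most probable) mode,
       which is cheaper to signal. */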
    WORD32 dst_strd2, dst_strd3;

    __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b;
    __m128i pred1_16x8b, pred2_16x8b, pred3_16x8b, pred4_16x8b;
    __m128i pred5_16x8b, pred6_16x8b, pred7_16x8b, pred8_16x8b;
    __m128i shuffle_16x8b, zero_vector, mask_low_32b;

    left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    top  = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    // loading the 4x4 source block and neighbouring pixels
    {
        __m128i row1_16x8b, row2_16x8b;

        row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        left_top_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);

        pu1_src += src_strd << 1;
        src_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);

        row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        zero_vector = _mm_setzero_si128();

        row1_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);
        src_16x8b = _mm_unpacklo_epi64(src_16x8b, row1_16x8b);
    }
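
    /*
     * src_16x8b now holds the whole 4x4 block (16 bytes, row 0 in the low
     * 4 bytes). pu1_ngbr_pels is laid out as: bytes 0..3 = left neighbours
     * bottom-to-top, byte 4 = top-left, bytes 5..12 = top and top-right
     * neighbours.
     */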

    /* Computing SADs */
    if(u4_valid_intra_modes & 1)/* Is VERT mode valid? */
    {
        pred0_16x8b = _mm_srli_si128(left_top_16x8b, 5);
        pred0_16x8b = _mm_shuffle_epi32(pred0_16x8b, 0);
        sad_8x16b = _mm_sad_epu8(src_16x8b, pred0_16x8b);

        sad[VERT_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[VERT_I4x4] = sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ? u4_lambda: lambda4);
    }

    if(u4_valid_intra_modes & 2)/* Is HORZ mode valid? */
    {
        shuffle_16x8b = _mm_setr_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
        pred1_16x8b = _mm_shuffle_epi8(left_top_16x8b, shuffle_16x8b);

        sad_8x16b = _mm_sad_epu8(src_16x8b, pred1_16x8b);

        sad[HORZ_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[HORZ_I4x4] = sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ? u4_lambda: lambda4);
    }

    if(u4_valid_intra_modes & 4)/* Is DC mode valid? */
    {
        if(top + left)
        {
            WORD32 shft = 1, dcval = 0;

            __m128i val_16x8b, temp_16x8b, temp_8x16b;

            val_16x8b = _mm_setzero_si128();

            if(top)
            {
                temp_16x8b = _mm_srli_si128(left_top_16x8b, 5);
                val_16x8b = _mm_alignr_epi8(temp_16x8b, val_16x8b, 4);
                shft++;
                dcval += 2;
            }
            if(left)
            {
                val_16x8b = _mm_alignr_epi8(left_top_16x8b, val_16x8b, 4);
                shft++;
                dcval += 2;
            }

            temp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
            dcval += _mm_extract_epi16(temp_8x16b, 4);
            dcval = dcval >> shft;
            pred2_16x8b = _mm_set1_epi8(dcval);
        }
        else
            pred2_16x8b = _mm_set1_epi8(128);

        sad_8x16b = _mm_sad_epu8(src_16x8b, pred2_16x8b);

        sad[DC_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[DC_I4x4] = sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ? u4_lambda: lambda4);
    }

    if(u4_valid_intra_modes > 7)/* Are modes other than VERT, HORZ and DC valid? */
    {
        __m128i w11_16x8b, w121_16x8b;
        __m128i temp1_16x8b, temp2_16x8b;

        /* Performing FILT121 and FILT11 operations for all neighbour values */
        {
            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b;
            __m128i const_2_8x16b;

            const_2_8x16b = _mm_set1_epi16(2);
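
            /* FILT121: out[i] = (in[i-1] + 2*in[i] + in[i+1] + 2) >> 2,
               computed on 16-bit lanes with the edge samples duplicated.
               FILT11 is the pairwise average (a + b + 1) >> 1, which
               _mm_avg_epu8 provides directly. */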

            temp1_8x16b = _mm_unpacklo_epi8(left_top_16x8b, zero_vector);   //l3 l2 l1 l0 tl t0 t1 t2
            temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2);                   // 0 l3 l2 l1 l0 tl t0 t1
            temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5);           //l3 l3 l2 l1 l0 tl t0 t1

            temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b);          //l3+l3  l3+l2       l2+l1...       t1+t2
            temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2);                   //l3+l3  l3+l3       l3+l2...       t0+t1
            temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5);
            temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b);          //4*l3   l3+2*l3+l2  l3+2*l2+l1...  t0+2*t1+t2

            temp1_8x16b = _mm_add_epi16(const_2_8x16b, temp1_8x16b);        //4*l3+2 3*l3+l2+2   l3+2*l2+l1+2.. t0+2*t1+t2+2
            temp1_8x16b = _mm_srli_epi16(temp1_8x16b, 2);

            temp1_16x8b = _mm_srli_si128(left_top_16x8b, 1);
            w11_16x8b = _mm_avg_epu8(left_top_16x8b, temp1_16x8b);

            temp2_16x8b = _mm_srli_si128(left_top_16x8b, 6);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_16x8b, zero_vector);      //t1 t2 t3 t4 t5 t6 t7 0
            temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2);                   //t2 t3 t4 t5 t6 t7 0  0
            temp3_8x16b = _mm_shufflehi_epi16(temp3_8x16b, 0xd4);           //t2 t3 t4 t5 t6 t7 t7 0

            temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b);          //t1+t2      t2+t3...     t6+t7      t7+t7 0
            temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2);                   //t2+t3      t3+t4...     t7+t7      0     0
            temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b);          //t1+2*t2+t3 t2+2*t3+t4.. t6+2*t7+t7 t7+t7 0

            temp2_8x16b = _mm_add_epi16(const_2_8x16b, temp2_8x16b);        //t1+2*t2+t3+2 t2+2*t3+t4+2 t3+2*t4+t5+2... t6+2*t7+t7+2 t7+t7+2  2
            temp2_8x16b = _mm_srli_epi16(temp2_8x16b, 2);

            w121_16x8b = _mm_packus_epi16(temp1_8x16b, temp2_8x16b);
        }

        if(u4_valid_intra_modes & 8)/* Is DIAG_DL mode valid? */
        {
            shuffle_16x8b = _mm_setr_epi8( 7,  8,  9, 10,
                                           8,  9, 10, 11,
                                           9, 10, 11, 12,
                                          10, 11, 12, 13);
            pred3_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred3_16x8b);

            sad[DIAG_DL_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[DIAG_DL_I4x4] = sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ? u4_lambda: lambda4);
        }

        if(u4_valid_intra_modes & 16)/* Is DIAG_DR mode valid? */
        {
            shuffle_16x8b = _mm_setr_epi8(5, 6, 7, 8,
                                          4, 5, 6, 7,
                                          3, 4, 5, 6,
                                          2, 3, 4, 5);
            pred4_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred4_16x8b);

            sad[DIAG_DR_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[DIAG_DR_I4x4] = sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ? u4_lambda: lambda4);
        }

        if(u4_valid_intra_modes & 32)/* Is VERT_R mode valid? */
        {
            temp1_16x8b = _mm_srli_si128(w121_16x8b, 1);
            temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(12, 13, 14, 15,
                                           4,  5,  6,  7,
                                           3, 12, 13, 14,
                                           2,  4,  5,  6);
            pred5_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred5_16x8b);

            sad[VERT_R_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[VERT_R_I4x4] = sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ? u4_lambda: lambda4);
        }

        if(u4_valid_intra_modes & 64)/* Is HORZ_D mode valid? */
        {
            temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(11, 5,  6, 7,
                                          10, 4, 11, 5,
                                           9, 3, 10, 4,
                                           8, 2,  9, 3);
            pred6_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred6_16x8b);

            sad[HORZ_D_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[HORZ_D_I4x4] = sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ? u4_lambda: lambda4);
        }

        if(u4_valid_intra_modes & 128)/* Is VERT_L mode valid? */
        {
            temp1_16x8b = _mm_srli_si128(w121_16x8b, 5);
            temp2_16x8b = _mm_srli_si128(w11_16x8b, 5);
            temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, temp2_16x8b);
            shuffle_16x8b = _mm_setr_epi8(8,  9, 10, 11,
                                          2,  3,  4,  5,
                                          9, 10, 11, 12,
                                          3,  4,  5,  6);
            pred7_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred7_16x8b);

            sad[VERT_L_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[VERT_L_I4x4] = sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ? u4_lambda: lambda4);
        }

        if(u4_valid_intra_modes & 256)/* Is HORZ_U mode valid? */
        {
            temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(10, 3, 9, 2,
                                           9, 2, 8, 1,
                                           8, 1, 0, 0,
                                           0, 0, 0, 0);
            pred8_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred8_16x8b);

            sad[HORZ_U_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[HORZ_U_I4x4] = sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ? u4_lambda: lambda4);
        }

        min_cost = MIN3(MIN3(cost[0], cost[1], cost[2]),
                        MIN3(cost[3], cost[4], cost[5]),
                        MIN3(cost[6], cost[7], cost[8]));
    }
    else
    {   /* Only the first three modes are valid */
        min_cost = MIN3(cost[0], cost[1], cost[2]);
    }

    *pu4_sadmin = min_cost;

    if(min_cost == cost[0])
    {
        *u4_intra_mode = VERT_I4x4;
    }
    else if(min_cost == cost[1])
    {
        *u4_intra_mode = HORZ_I4x4;
        pred0_16x8b = pred1_16x8b;
    }
    else if(min_cost == cost[2])
    {
        *u4_intra_mode = DC_I4x4;
        pred0_16x8b = pred2_16x8b;
    }
    else if(min_cost == cost[3])
    {
        *u4_intra_mode = DIAG_DL_I4x4;
        pred0_16x8b = pred3_16x8b;
    }
    else if(min_cost == cost[4])
    {
        *u4_intra_mode = DIAG_DR_I4x4;
        pred0_16x8b = pred4_16x8b;
    }
    else if(min_cost == cost[5])
    {
        *u4_intra_mode = VERT_R_I4x4;
        pred0_16x8b = pred5_16x8b;
    }
    else if(min_cost == cost[6])
    {
        *u4_intra_mode = HORZ_D_I4x4;
        pred0_16x8b = pred6_16x8b;
    }
    else if(min_cost == cost[7])
    {
        *u4_intra_mode = VERT_L_I4x4;
        pred0_16x8b = pred7_16x8b;
    }
    else if(min_cost == cost[8])
    {
        *u4_intra_mode = HORZ_U_I4x4;
        pred0_16x8b = pred8_16x8b;
    }

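    /* Store the chosen 4x4 prediction: only the low 4 bytes of each
       shifted register are written, so _mm_maskmoveu_si128 with a mask
       covering bytes 0..3 avoids writing past the block. */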
    mask_low_32b = _mm_set1_epi8(0xff);
    mask_low_32b = _mm_srli_si128(mask_low_32b, 12);

    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)pu1_dst);
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)(pu1_dst + dst_strd));
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)(pu1_dst + dst_strd2));
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)(pu1_dst + dst_strd3));
}

/**
******************************************************************************
*
* @brief
*  Evaluate the best intra chroma mode (among VERT, HORZ and DC) and do the
*  prediction.
*
* @par Description
*  This function evaluates the first three intra chroma modes, computes the
*  corresponding SADs and returns the buffer predicted with the best mode.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_ngbr_pels
*  UWORD8 pointer to neighbouring pels
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] u4_n_avblty
*  availability of neighbouring pixels
*
* @param[out] u4_intra_mode
*  pointer to the variable in which the best mode is returned
*
* @param[in,out] pu4_sadmin
*  pointer to the variable in which the minimum SAD is returned
*
* @param[in] u4_valid_intra_modes
*  indicates which modes are valid
*
* @return
*  None
*
******************************************************************************
*/

void ih264e_evaluate_intra_chroma_modes_ssse3(UWORD8 *pu1_src,
                                              UWORD8 *pu1_ngbr_pels,
                                              UWORD8 *pu1_dst,
                                              UWORD32 src_strd,
                                              UWORD32 dst_strd,
                                              WORD32 u4_n_avblty,
                                              UWORD32 *u4_intra_mode,
                                              WORD32 *pu4_sadmin,
                                              UWORD32 u4_valid_intra_modes)
{
    WORD32 left, top;
    WORD32 sad_vert = INT_MAX, sad_horz = INT_MAX, sad_dc = INT_MAX, min_sad;

    __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
    __m128i src5_16x8b, src6_16x8b, src7_16x8b, src8_16x8b;

    __m128i top_16x8b, left_16x8b;
    __m128i pred1_16x8b, pred2_16x8b;
    __m128i tmp1_8x16b, tmp2_8x16b, sad_8x16b;

    left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

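    /*
     * Chroma is interleaved UV: one 8x8 U block and one 8x8 V block give
     * 16 bytes per row. For this function the valid-mode flags are
     * bit 0 => DC, bit 1 => HORZ, bit 2 => VERT. The neighbour buffer
     * holds 16 bytes of left UV pairs (stored bottom-to-top), the
     * top-left pair, then the 16 top bytes at offset 2 * BLK8x8SIZE + 2.
     */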
    // Loading source
    {
        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src6_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src7_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src8_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    }

    if(left)
    {
        left_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);

        if(u4_valid_intra_modes & 02) //If HORZ mode is valid
        {
            __m128i left_tmp_16x8b, left_sh_16x8b;
            __m128i const_14_15_16x8b;

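            /* _mm_shuffle_epi8 with 0x0f0e replicates bytes 14-15 (the UV
               pair for the current row) across the register; each
               _mm_slli_si128 by 2 or 4 moves the next pair(s) down the
               left column into position. */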
            const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
            left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);

            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 1
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2
            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 3
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 4
            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 5
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 6
            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 7
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 8
            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);

            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_horz = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    if(top)
    {
        UWORD8 *pu1_top;

        pu1_top = pu1_ngbr_pels + 2 * BLK8x8SIZE + 2;
        top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

        if(u4_valid_intra_modes & 04) //If VERT mode is valid
        {
            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_vert = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    if(u4_valid_intra_modes & 01) //If DC mode is valid
    {
        if(left && top)
        {
            WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
            WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
            WORD32 dc_1u, dc_1v, dc_2u, dc_2v;

            __m128i val_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();
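
            /* Masking with 0x00ff keeps the U bytes; shifting right by one
               byte first keeps the V bytes instead. SAD against zero then
               sums 4 U (or V) neighbours per 64-bit lane: lane 0 covers
               the lower half of the edge, lane 4 the upper half. */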

            val_sh_16x8b = _mm_srli_si128(left_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
            left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
            left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
            left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);

            val_sh_16x8b = _mm_srli_si128(top_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
            top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
            top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
            top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);

            // First four rows
            dc_1u = (left_up_u + top_left_u + 4) >> 3;
            dc_1v = (left_up_v + top_left_v + 4) >> 3;
            dc_2u = (top_right_u + 2) >> 2;
            dc_2v = (top_right_v + 2) >> 2;

            pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            // Second four rows
            dc_1u = (left_down_u + 2) >> 2;
            dc_1v = (left_down_v + 2) >> 2;
            dc_2u = (left_down_u + top_right_u + 4) >> 3;
            dc_2v = (left_down_v + top_right_v + 4) >> 3;

            pred2_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else if(left)
        {
            WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
            WORD32 dc_u, dc_v;

            __m128i left_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            left_sh_16x8b = _mm_srli_si128(left_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
            left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
            left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
            left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);

            // First four rows
            dc_u = (left_up_u + 2) >> 2;
            dc_v = (left_up_v + 2) >> 2;

            pred1_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            // Second four rows
            dc_u = (left_down_u + 2) >> 2;
            dc_v = (left_down_v + 2) >> 2;

            pred2_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else if(top)
        {
            WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
            WORD32 dc_1u, dc_1v, dc_2u, dc_2v;

            __m128i top_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            top_sh_16x8b = _mm_srli_si128(top_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
            top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
            top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
            top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);

            dc_1u = (top_left_u + 2) >> 2;
            dc_1v = (top_left_v + 2) >> 2;
            dc_2u = (top_right_u + 2) >> 2;
            dc_2v = (top_right_v + 2) >> 2;

            pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else
        {
            pred1_16x8b = _mm_set1_epi8(128);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    min_sad = MIN3(sad_horz, sad_vert, sad_dc);

    /* Finding minimum SAD and doing corresponding prediction */
    if(min_sad < *pu4_sadmin)
    {
        *pu4_sadmin = min_sad;

        if(min_sad == sad_dc)
        {
            *u4_intra_mode = DC_CH_I8x8;

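            /* pred2 was only set on the left-available paths; when the
               left edge is absent the same predictor applies to all
               eight rows. */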
            if(!left)
                pred2_16x8b = pred1_16x8b;

            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;

            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
        }
        else if(min_sad == sad_horz)
        {
            __m128i left_sh_16x8b, const_14_15_16x8b;

            *u4_intra_mode = HORZ_CH_I8x8;

            const_14_15_16x8b = _mm_set1_epi16(0x0f0e);

            left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 1
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2

            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 3
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 4

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 5
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 6

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 7
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 8

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
        }
        else
        {
            *u4_intra_mode = VERT_CH_I8x8;

            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
        }
    }
}