1 /******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ih264e_intra_modes_eval_ssse3.c
24 *
25 * @brief
26 * This file contains definitions of routines that perform rate distortion
27 * analysis on a macroblock if they are to be coded as intra.
28 *
29 * @author
30 * Ittiam
31 *
32 * @par List of Functions:
33 * ih264e_evaluate_intra16x16_modes_ssse3
34 * ih264e_evaluate_intra_4x4_modes_ssse3
35 * ih264e_evaluate_intra_chroma_modes_ssse3
36 *
37 * @remarks
38 * None
39 *
40 *******************************************************************************
41 */
42
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46
47 /* System include files */
48 #include <stdio.h>
49 #include <string.h>
50 #include <limits.h>
51 #include <assert.h>
52 #include <immintrin.h>
53
54 /* User include files */
55 #include "ih264e_config.h"
56 #include "ih264_typedefs.h"
57 #include "ih264e_defs.h"
58 #include "iv2.h"
59 #include "ive2.h"
60 #include "ih264_debug.h"
61 #include "ih264_defs.h"
62 #include "ih264_macros.h"
63 #include "ih264_intra_pred_filters.h"
64 #include "ih264_structs.h"
65 #include "ih264_common_tables.h"
66 #include "ih264_trans_quant_itrans_iquant.h"
67 #include "ih264_inter_pred_filters.h"
68 #include "ih264_mem_fns.h"
69 #include "ih264_padding.h"
70 #include "ih264_deblk_edge_filters.h"
71 #include "ime_distortion_metrics.h"
72 #include "ih264e_error.h"
73 #include "ih264e_bitstream.h"
74 #include "ime_defs.h"
75 #include "ime_structs.h"
76 #include "ih264_cabac_tables.h"
77 #include "irc_cntrl_param.h"
78 #include "irc_frame_info_collector.h"
79 #include "ih264e_rate_control.h"
80
81 #include "ih264e_cabac_structs.h"
82 #include "ih264e_structs.h"
83 #include "ih264e_cabac.h"
84 #include "ih264e_intra_modes_eval.h"
85 #include "ih264e_globals.h"
86 #include "ime_platform_macros.h"
87
88
89 /*****************************************************************************/
90 /* Function Definitions */
91 /*****************************************************************************/
92 /**
93 ******************************************************************************
94 *
95 * @brief
96 * evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
97 * prediction.
98 *
99 * @par Description
100 * This function evaluates first three 16x16 modes and compute corresponding
101 * SAD and returns the buffer predicted with best mode.
102 *
103 * @param[in] pu1_src
104 * UWORD8 pointer to the source
105 *
106 * @param[in] pu1_ngbr_pels_i16
107 * UWORD8 pointer to neighbouring pels
108 *
109 * @param[out] pu1_dst
110 * UWORD8 pointer to the destination
111 *
112 * @param[in] src_strd
113 * integer source stride
114 *
115 * @param[in] dst_strd
116 * integer destination stride
117 *
118 * @param[in] u4_n_avblty
119 * availability of neighbouring pixels
120 *
* @param[out] u4_intra_mode
*  pointer to the variable in which best mode is returned
*
* @param[out] pu4_sadmin
*  pointer to the variable in which minimum sad is returned
126 *
127 * @param[in] u4_valid_intra_modes
128 * says what all modes are valid
129 *
130 * @return
131 * None
132 *
133 ******************************************************************************
134 */
ih264e_evaluate_intra16x16_modes_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels_i16,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)135 void ih264e_evaluate_intra16x16_modes_ssse3(UWORD8 *pu1_src,
136 UWORD8 *pu1_ngbr_pels_i16,
137 UWORD8 *pu1_dst,
138 UWORD32 src_strd,
139 UWORD32 dst_strd,
140 WORD32 n_avblty,
141 UWORD32 *u4_intra_mode,
142 WORD32 *pu4_sadmin,
143 UWORD32 u4_valid_intra_modes)
144 {
145 UWORD8 *pu1_src_temp;
146
147 WORD32 left, top, horz_flag, vert_flag, dc_flag;
148 WORD32 sad_vert, sad_horz, sad_dc, min_sad;
149
150 WORD32 cnt, dcval;
151 WORD32 src_strd2, src_strd3, src_strd4;
152 WORD32 dst_strd2, dst_strd3, dst_strd4;
153
154 __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
155 __m128i val1_16x8b, val2_16x8b, val3_16x8b, val4_16x8b;
156 __m128i sad1_8x16b, sad2_8x16b, sad3_8x16b, sad4_8x16b;
157
158 __m128i sad_8x16b, val_16x8b, zero_vector;
159
160 sad_vert = INT_MAX;
161 sad_horz = INT_MAX;
162 sad_dc = INT_MAX;
163
164 src_strd2 = src_strd << 1;
165 src_strd4 = src_strd << 2;
166 src_strd3 = src_strd + src_strd2;
167
168 dst_strd2 = dst_strd << 1;
169 dst_strd4 = dst_strd << 2;
170 dst_strd3 = dst_strd + dst_strd2;
171
172 left = (n_avblty & LEFT_MB_AVAILABLE_MASK);
173 top = (n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
174
175 zero_vector = _mm_setzero_si128();
176
177 horz_flag = left && ((u4_valid_intra_modes & 02) != 0);
178 vert_flag = top && ((u4_valid_intra_modes & 01) != 0);
179 dc_flag = (u4_valid_intra_modes & 04) != 0;
180
181 if(horz_flag)
182 {
183 pu1_src_temp = pu1_src;
184
185 val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[15]);
186 val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[14]);
187 val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[13]);
188 val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[12]);
189
190 src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
191 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
192 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
193 src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));
194
195 sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
196 sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
197 sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
198 sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);
199
200 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
201 sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
202
203 cnt = 11;
204 sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
205 do
206 {
207 pu1_src_temp += src_strd4;
208
209 val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
210 val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
211 val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
212 val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);
213
214 src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
215 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
216 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
217 src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));
218
219 sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
220 sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
221 sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
222 sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);
223
224 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
225 sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
226 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
227
228 cnt -= 4;
229 sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
230 }
231 while(cnt >= 0);
232
233 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
234 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
235 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
236
237 sad_horz = _mm_extract_epi16(sad_8x16b, 0);
238 }
239
240 if(vert_flag)
241 {
242 pu1_src_temp = pu1_src;
243
244 val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
245
246 src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
247 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
248 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
249 src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));
250
251 sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
252 sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
253 sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
254 sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);
255
256 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
257 sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
258
259 cnt = 11;
260 sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
261 do
262 {
263 pu1_src_temp += src_strd4;
264
265 src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
266 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
267 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
268 src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));
269
270 sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
271 sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
272 sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
273 sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);
274
275 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
276 sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
277 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
278
279 cnt -= 4;
280 sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
281 }
282 while(cnt >= 0);
283
284 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
285 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
286 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
287
288 sad_vert = _mm_extract_epi16(sad_8x16b, 0);
289 }
290
291 dcval = 0;
292
293 if(left)
294 {
295 val_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels_i16);
296 dcval += 8;
297
298 sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
299 dcval += _mm_extract_epi16(sad1_8x16b, 0);
300 dcval += _mm_extract_epi16(sad1_8x16b, 4);
301 }
302 if(top)
303 {
304 val_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
305 dcval += 8;
306
307 sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
308 dcval += _mm_extract_epi16(sad1_8x16b, 0);
309 dcval += _mm_extract_epi16(sad1_8x16b, 4);
310 }
311 dcval = dcval >> (3 + left + top);
312 dcval += ((left == 0) & (top == 0)) << 7;
313
314 if(dc_flag)
315 {
316 pu1_src_temp = pu1_src;
317 val1_16x8b = _mm_set1_epi8(dcval);
318
319 src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
320 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
321 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
322 src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));
323
324 sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
325 sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
326 sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
327 sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);
328
329 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
330 sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
331
332 cnt = 12;
333 sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
334 do
335 {
336 pu1_src_temp += src_strd4;
337
338 src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
339 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
340 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
341 src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));
342
343 sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
344 sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
345 sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
346 sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);
347
348 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
349 sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
350 sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
351
352 cnt -= 4;
353 sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
354 }
355 while(cnt > 0);
356
357 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
358 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
359 sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
360
361 sad_dc = _mm_extract_epi16(sad_8x16b, 0);
362 }
363
364 // Doing prediction for minimum SAD
365 min_sad = MIN3(sad_horz, sad_vert, sad_dc);
366 if(min_sad < *pu4_sadmin)
367 {
368 *pu4_sadmin = min_sad;
369 if(min_sad == sad_vert)
370 {
371 *u4_intra_mode = VERT_I16x16;
372 val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
373 cnt = 15;
374 do
375 {
376 _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
377 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
378 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
379 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);
380
381 cnt -= 4;
382 pu1_dst += dst_strd4;
383 }
384 while(cnt > 0);
385 }
386 else if(min_sad == sad_horz)
387 {
388 *u4_intra_mode = HORZ_I16x16;
389 cnt = 15;
390 do
391 {
392 val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
393 val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
394 val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
395 val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);
396
397 _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
398 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val2_16x8b);
399 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val3_16x8b);
400 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val4_16x8b);
401
402 cnt -= 4;
403 pu1_dst += dst_strd4;
404 }
405 while(cnt >= 0);
406 }
407 else
408 {
409 *u4_intra_mode = DC_I16x16;
410 val1_16x8b = _mm_set1_epi8(dcval);
411 cnt = 15;
412 do
413 {
414 _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
415 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
416 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
417 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);
418
419 cnt -= 4;
420 pu1_dst += dst_strd4;
421 }
422 while(cnt > 0);
423 }
424 }
425 }
426
427 /**
428 ******************************************************************************
429 *
430 * @brief :Evaluate best intra 4x4 mode and do the prediction.
431 *
432 * @par Description
433 * This function evaluates intra 4x4 modes, computes corresponding sad
434 * and returns the buffer predicted with best mode.
435 *
436 * @param[in] pu1_src
437 * UWORD8 pointer to the source
438 *
* @param[in] pu1_ngbr_pels
*  UWORD8 pointer to neighbouring pels
441 *
442 * @param[out] pu1_dst
443 * UWORD8 pointer to the destination
444 *
445 * @param[in] src_strd
446 * integer source stride
447 *
448 * @param[in] dst_strd
449 * integer destination stride
450 *
451 * @param[in] u4_n_avblty
452 * availability of neighbouring pixels
453 *
* @param[out] u4_intra_mode
*  Pointer to the variable in which best mode is returned
*
* @param[out] pu4_sadmin
*  Pointer to the variable in which minimum cost is returned
459 *
460 * @param[in] u4_valid_intra_modes
461 * Says what all modes are valid
462 *
* @param[in] u4_lambda
*  Lambda value for computing cost from SAD
465 *
466 * @param[in] u4_predictd_mode
467 * Predicted mode for cost computation
468 *
469 * @return none
470 *
471 ******************************************************************************
472 */
ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes,UWORD32 u4_lambda,UWORD32 u4_predictd_mode)473 void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src,
474 UWORD8 *pu1_ngbr_pels,
475 UWORD8 *pu1_dst,
476 UWORD32 src_strd,
477 UWORD32 dst_strd,
478 WORD32 u4_n_avblty,
479 UWORD32 *u4_intra_mode,
480 WORD32 *pu4_sadmin,
481 UWORD32 u4_valid_intra_modes,
482 UWORD32 u4_lambda,
483 UWORD32 u4_predictd_mode)
484 {
485 WORD32 left, top;
486 WORD32 sad[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
487 INT_MAX, INT_MAX, INT_MAX, INT_MAX };
488 WORD32 cost[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
489 INT_MAX, INT_MAX, INT_MAX, INT_MAX };
490
491 WORD32 min_cost;
492 UWORD32 lambda4 = u4_lambda << 2;
493 WORD32 dst_strd2, dst_strd3;
494
495 __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b;
496 __m128i pred1_16x8b, pred2_16x8b, pred3_16x8b, pred4_16x8b;
497 __m128i pred5_16x8b, pred6_16x8b, pred7_16x8b, pred8_16x8b;
498 __m128i shuffle_16x8b, zero_vector, mask_low_32b;
499
500 left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
501 top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
502
503 dst_strd2 = dst_strd << 1;
504 dst_strd3 = dst_strd + dst_strd2;
505
506 // loading the 4x4 source block and neighbouring pixels
507 {
508 __m128i row1_16x8b, row2_16x8b;
509
510 row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
511 row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
512 left_top_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);
513
514 pu1_src += src_strd << 1;
515 src_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);
516
517 row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
518 row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
519 zero_vector = _mm_setzero_si128();
520
521 row1_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);
522 src_16x8b = _mm_unpacklo_epi64(src_16x8b, row1_16x8b);
523 }
524
525 /* Computing SADs*/
526 if(u4_valid_intra_modes & 1)/* VERT mode valid ????*/
527 {
528 pred0_16x8b = _mm_srli_si128(left_top_16x8b, 5);
529 pred0_16x8b = _mm_shuffle_epi32(pred0_16x8b, 0);
530 sad_8x16b = _mm_sad_epu8(src_16x8b, pred0_16x8b);
531
532 sad[VERT_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
533 cost[VERT_I4x4] = sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ? u4_lambda: lambda4);
534 }
535
536 if(u4_valid_intra_modes & 2)/* HORZ mode valid ????*/
537 {
538 shuffle_16x8b = _mm_setr_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
539 pred1_16x8b = _mm_shuffle_epi8(left_top_16x8b, shuffle_16x8b);
540
541 sad_8x16b = _mm_sad_epu8(src_16x8b, pred1_16x8b);
542
543 sad[HORZ_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
544 cost[HORZ_I4x4] = sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ? u4_lambda: lambda4);
545 }
546
547 if(u4_valid_intra_modes & 4)/* DC mode valid ????*/
548 {
549 if(top + left)
550 {
551 WORD32 shft = 1, dcval = 0;
552
553 __m128i val_16x8b, temp_16x8b, temp_8x16b;
554
555 val_16x8b = _mm_setzero_si128();
556
557 if(top)
558 {
559 temp_16x8b = _mm_srli_si128(left_top_16x8b, 5);
560 val_16x8b = _mm_alignr_epi8(temp_16x8b, val_16x8b, 4);
561 shft ++;
562 dcval += 2;
563 }
564 if(left)
565 {
566 val_16x8b = _mm_alignr_epi8(left_top_16x8b, val_16x8b, 4);
567 shft++;
568 dcval += 2;
569 }
570
571 temp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
572 dcval += _mm_extract_epi16(temp_8x16b, 4);
573 dcval = dcval >> shft;
574 pred2_16x8b = _mm_set1_epi8(dcval);
575 }
576 else
577 pred2_16x8b = _mm_set1_epi8(128);
578
579 sad_8x16b = _mm_sad_epu8(src_16x8b, pred2_16x8b);
580
581 sad[DC_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
582 cost[DC_I4x4] = sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ? u4_lambda: lambda4);
583 }
584
585 if(u4_valid_intra_modes > 7)/* if modes other than VERT, HORZ and DC are valid ????*/
586 {
587 __m128i w11_16x8b, w121_16x8b;
588 __m128i temp1_16x8b, temp2_16x8b;
589
590 /* Performing FILT121 and FILT11 operation for all neighbour values*/
591 {
592 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b;
593 __m128i const_2_8x16b;
594
595 const_2_8x16b = _mm_set1_epi16(2);
596
597 temp1_8x16b = _mm_unpacklo_epi8(left_top_16x8b, zero_vector); //l3 l2 l1 l0 tl t0 t1 t2
598 temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2); // 0 l3 l2 l1 l0 tl t0 t1
599 temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5); //l3 l3 l2 l1 l0 tl t0 t1
600
601 temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b); //l3+l3 l3+l2 l2+l1... t1+t2
602 temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2); //l3+l3 l3+l3 l3+l2... t0+t1
603 temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5);
604 temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b); //4*l3 l3+2*l3+l2 l3+2*l2+l1... t0+2*t1+t2
605
606 temp1_8x16b = _mm_add_epi16(const_2_8x16b, temp1_8x16b); //4*l3+2 3*l3+l2+2 l3+2*l2+l1+2.. t0+2*t1+t2+2
607 temp1_8x16b = _mm_srli_epi16(temp1_8x16b, 2);
608
609 temp1_16x8b = _mm_srli_si128(left_top_16x8b, 1);
610 w11_16x8b = _mm_avg_epu8(left_top_16x8b, temp1_16x8b);
611
612 temp2_16x8b = _mm_srli_si128(left_top_16x8b, 6);
613 temp2_8x16b = _mm_unpacklo_epi8(temp2_16x8b, zero_vector); //t1 t2 t3 t4 t5 t6 t7 0
614 temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2); //t2 t3 t4 t5 t6 t7 0 0
615 temp3_8x16b = _mm_shufflehi_epi16(temp3_8x16b, 0xd4); //t2 t3 t4 t5 t6 t7 t7 0
616
617 temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b); //t1+t2 t2+t3... t6+t7 t7+t7 0
618 temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2); //t2+t3 t3+t4... t7+t7 0 0
619 temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b); //t1+2*t2+t3 t2+2*t3+t4.. t6+2*t7+t7 t7+t7 0
620
621 temp2_8x16b = _mm_add_epi16(const_2_8x16b, temp2_8x16b); //t1+2*t2+t3+2 t2+2*t3+t4+2 t3+2*t4+t5+2... t6+2*t7+t7+2 t7+t7+2 2
622 temp2_8x16b = _mm_srli_epi16(temp2_8x16b, 2);
623
624 w121_16x8b = _mm_packus_epi16(temp1_8x16b, temp2_8x16b);
625 }
626
627 if(u4_valid_intra_modes & 8)/* DIAG_DL */
628 {
629 shuffle_16x8b = _mm_setr_epi8( 7, 8, 9, 10,
630 8, 9, 10, 11,
631 9, 10, 11, 12,
632 10, 11, 12, 13);
633 pred3_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
634 sad_8x16b = _mm_sad_epu8(src_16x8b, pred3_16x8b);
635
636 sad[DIAG_DL_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
637 cost[DIAG_DL_I4x4] = sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ? u4_lambda: lambda4);
638 }
639
640 if(u4_valid_intra_modes & 16)/* DIAG_DR */
641 {
642 shuffle_16x8b = _mm_setr_epi8(5, 6, 7, 8,
643 4, 5, 6, 7,
644 3, 4, 5, 6,
645 2, 3, 4, 5);
646 pred4_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
647 sad_8x16b = _mm_sad_epu8(src_16x8b, pred4_16x8b);
648
649 sad[DIAG_DR_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
650 cost[DIAG_DR_I4x4] = sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ? u4_lambda: lambda4);
651 }
652
653 if(u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/
654 {
655 temp1_16x8b = _mm_srli_si128(w121_16x8b, 1);
656 temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, w11_16x8b);
657 shuffle_16x8b = _mm_setr_epi8(12, 13, 14, 15,
658 4, 5, 6, 7,
659 3, 12, 13, 14,
660 2, 4, 5, 6);
661 pred5_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
662 sad_8x16b = _mm_sad_epu8(src_16x8b, pred5_16x8b);
663
664 sad[VERT_R_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
665 cost[VERT_R_I4x4] = sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ? u4_lambda: lambda4);
666 }
667
668 if(u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/
669 {
670 temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
671 shuffle_16x8b = _mm_setr_epi8(11, 5, 6, 7,
672 10, 4, 11, 5,
673 9, 3, 10, 4,
674 8, 2, 9, 3);
675 pred6_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
676 sad_8x16b = _mm_sad_epu8(src_16x8b, pred6_16x8b);
677
678 sad[HORZ_D_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
679 cost[HORZ_D_I4x4] = sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ? u4_lambda: lambda4);
680 }
681
682 if(u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/
683 {
684 temp1_16x8b = _mm_srli_si128(w121_16x8b, 5);
685 temp2_16x8b = _mm_srli_si128(w11_16x8b, 5);
686 temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, temp2_16x8b);
687 shuffle_16x8b = _mm_setr_epi8(8, 9, 10, 11,
688 2, 3, 4, 5,
689 9, 10, 11, 12,
690 3, 4, 5, 6);
691 pred7_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
692 sad_8x16b = _mm_sad_epu8(src_16x8b, pred7_16x8b);
693
694 sad[VERT_L_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
695 cost[VERT_L_I4x4] = sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ? u4_lambda: lambda4);
696 }
697
698 if(u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/
699 {
700 temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
701 shuffle_16x8b = _mm_setr_epi8(10, 3, 9, 2,
702 9, 2, 8, 1,
703 8, 1, 0, 0,
704 0, 0, 0, 0);
705 pred8_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
706 sad_8x16b = _mm_sad_epu8(src_16x8b, pred8_16x8b);
707
708 sad[HORZ_U_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
709 cost[HORZ_U_I4x4] = sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ? u4_lambda: lambda4);
710 }
711
712 min_cost = MIN3(MIN3(cost[0], cost[1], cost[2]),
713 MIN3(cost[3], cost[4], cost[5]),
714 MIN3(cost[6], cost[7], cost[8]));
715 }
716 else
717 { /*Only first three modes valid*/
718 min_cost = MIN3(cost[0], cost[1], cost[2]);
719 }
720
721 *pu4_sadmin = min_cost;
722
723 if(min_cost == cost[0])
724 {
725 *u4_intra_mode = VERT_I4x4;
726 }
727 else if(min_cost == cost[1])
728 {
729 *u4_intra_mode = HORZ_I4x4;
730 pred0_16x8b = pred1_16x8b;
731 }
732 else if(min_cost == cost[2])
733 {
734 *u4_intra_mode = DC_I4x4;
735 pred0_16x8b = pred2_16x8b;
736 }
737 else if(min_cost == cost[3])
738 {
739 *u4_intra_mode = DIAG_DL_I4x4;
740 pred0_16x8b = pred3_16x8b;
741 }
742 else if(min_cost == cost[4])
743 {
744 *u4_intra_mode = DIAG_DR_I4x4;
745 pred0_16x8b = pred4_16x8b;
746 }
747 else if(min_cost == cost[5])
748 {
749 *u4_intra_mode = VERT_R_I4x4;
750 pred0_16x8b = pred5_16x8b;
751 }
752 else if(min_cost == cost[6])
753 {
754 *u4_intra_mode = HORZ_D_I4x4;
755 pred0_16x8b = pred6_16x8b;
756 }
757 else if(min_cost == cost[7])
758 {
759 *u4_intra_mode = VERT_L_I4x4;
760 pred0_16x8b = pred7_16x8b;
761 }
762 else if(min_cost == cost[8])
763 {
764 *u4_intra_mode = HORZ_U_I4x4;
765 pred0_16x8b = pred8_16x8b;
766 }
767
768 mask_low_32b = _mm_set1_epi8(0xff);
769 mask_low_32b = _mm_srli_si128(mask_low_32b, 12);
770
771 _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)pu1_dst);
772 pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
773 _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
774 pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
775 _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
776 pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
777 _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
778
779 }
780
781 /**
782 ******************************************************************************
783 *
784 * @brief
785 * Evaluate best intra chroma mode (among VERT, HORZ and DC) and do the prediction.
786 *
787 * @par Description
788 * This function evaluates first three intra chroma modes and compute corresponding sad
789 * and return the buffer predicted with best mode.
790 *
791 * @param[in] pu1_src
792 * UWORD8 pointer to the source
793 *
* @param[in] pu1_ngbr_pels
*  UWORD8 pointer to neighbouring pels
796 *
797 * @param[out] pu1_dst
798 * UWORD8 pointer to the destination
799 *
800 * @param[in] src_strd
801 * integer source stride
802 *
803 * @param[in] dst_strd
804 * integer destination stride
805 *
806 * @param[in] u4_n_avblty
807 * availability of neighbouring pixels
808 *
809 * @param[in] u4_intra_mode
810 * pointer to the variable in which best mode is returned
811 *
812 * @param[in] pu4_sadmin
813 * pointer to the variable in which minimum sad is returned
814 *
815 * @param[in] u4_valid_intra_modes
816 * says what all modes are valid
817 *
818 * @return
819 * none
820 *
821 ******************************************************************************
822 */
823
ih264e_evaluate_intra_chroma_modes_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)824 void ih264e_evaluate_intra_chroma_modes_ssse3(UWORD8 *pu1_src,
825 UWORD8 *pu1_ngbr_pels,
826 UWORD8 *pu1_dst,
827 UWORD32 src_strd,
828 UWORD32 dst_strd,
829 WORD32 u4_n_avblty,
830 UWORD32 *u4_intra_mode,
831 WORD32 *pu4_sadmin,
832 UWORD32 u4_valid_intra_modes)
833 {
834 WORD32 left, top;
835 WORD32 sad_vert = INT_MAX, sad_horz = INT_MAX, sad_dc = INT_MAX, min_sad;
836
837 __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
838 __m128i src5_16x8b, src6_16x8b, src7_16x8b, src8_16x8b;
839
840 __m128i top_16x8b, left_16x8b;
841 __m128i pred1_16x8b, pred2_16x8b;
842 __m128i tmp1_8x16b, tmp2_8x16b, sad_8x16b;
843
844 left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
845 top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
846
847 //Loading source
848 {
849 src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
850 pu1_src += src_strd;
851 src2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
852 pu1_src += src_strd;
853 src3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
854 pu1_src += src_strd;
855 src4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
856 pu1_src += src_strd;
857 src5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
858 pu1_src += src_strd;
859 src6_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
860 pu1_src += src_strd;
861 src7_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
862 pu1_src += src_strd;
863 src8_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
864 }
865
866 if(left)
867 {
868 left_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);
869
870 if(u4_valid_intra_modes & 02) //If HORZ mode is valid
871 {
872 __m128i left_tmp_16x8b, left_sh_16x8b;
873 __m128i const_14_15_16x8b;
874
875 const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
876 left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
877
878 pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 1
879 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2
880 tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
881 tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred2_16x8b);
882
883 left_tmp_16x8b = _mm_slli_si128(left_16x8b, 4);
884 left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
885 sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);
886
887 pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 3
888 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 4
889 tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
890 tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred2_16x8b);
891
892 left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
893 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
894 left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
895 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
896
897 pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 5
898 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 6
899 tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
900 tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
901
902 left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
903 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
904 left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
905 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
906
907 pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 7
908 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 8
909 tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
910 tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
911
912 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
913 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
914
915 sad_horz = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
916 }
917 }
918
919 if(top)
920 {
921 UWORD8 *pu1_top;
922
923 pu1_top = pu1_ngbr_pels + 2 * BLK8x8SIZE + 2;
924 top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
925
926 if(u4_valid_intra_modes & 04) //If VERT mode is valid
927 {
928 tmp1_8x16b = _mm_sad_epu8(src1_16x8b, top_16x8b);
929 tmp2_8x16b = _mm_sad_epu8(src2_16x8b, top_16x8b);
930 sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);
931
932 tmp1_8x16b = _mm_sad_epu8(src3_16x8b, top_16x8b);
933 tmp2_8x16b = _mm_sad_epu8(src4_16x8b, top_16x8b);
934 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
935 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
936
937 tmp1_8x16b = _mm_sad_epu8(src5_16x8b, top_16x8b);
938 tmp2_8x16b = _mm_sad_epu8(src6_16x8b, top_16x8b);
939 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
940 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
941
942 tmp1_8x16b = _mm_sad_epu8(src7_16x8b, top_16x8b);
943 tmp2_8x16b = _mm_sad_epu8(src8_16x8b, top_16x8b);
944 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
945 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
946
947 sad_vert = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
948 }
949 }
950
951 if(u4_valid_intra_modes & 01) //If DC mode is valid
952 {
953 if(left && top)
954 {
955 WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
956 WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
957 WORD32 dc_1u, dc_1v, dc_2u, dc_2v;
958
959 __m128i val_sh_16x8b;
960 __m128i intrlv_mask_8x16b, zero_vector;
961
962 intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
963 zero_vector = _mm_setzero_si128();
964
965 val_sh_16x8b = _mm_srli_si128(left_16x8b, 1);
966
967 tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
968 tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
969 tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
970 tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);
971
972 left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
973 left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
974 left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
975 left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);
976
977 val_sh_16x8b = _mm_srli_si128(top_16x8b, 1);
978
979 tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
980 tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
981 tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
982 tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);
983
984 top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
985 top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
986 top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
987 top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);
988
989 // First four rows
990 dc_1u = (left_up_u + top_left_u + 4) >> 3;
991 dc_1v = (left_up_v + top_left_v + 4) >> 3;
992 dc_2u = (top_right_u + 2) >> 2;
993 dc_2v = (top_right_v + 2) >> 2;
994
995 pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
996 dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);
997
998 tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
999 tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
1000 sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);
1001
1002 tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
1003 tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
1004 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1005 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1006
1007 // Second four rows
1008 dc_1u = (left_down_u + 2) >> 2;
1009 dc_1v = (left_down_v + 2) >> 2;
1010 dc_2u = (left_down_u + top_right_u + 4) >> 3;
1011 dc_2v = (left_down_v + top_right_v + 4) >> 3;
1012
1013 pred2_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
1014 dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);
1015
1016 tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
1017 tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
1018 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1019 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1020
1021 tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
1022 tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
1023 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1024 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1025
1026 sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
1027 }
1028 else if(left)
1029 {
1030 WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
1031 WORD32 dc_u, dc_v;
1032
1033 __m128i left_sh_16x8b;
1034 __m128i intrlv_mask_8x16b, zero_vector;
1035
1036 intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
1037 zero_vector = _mm_setzero_si128();
1038
1039 left_sh_16x8b = _mm_srli_si128(left_16x8b, 1);
1040
1041 tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
1042 tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_sh_16x8b);
1043 tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
1044 tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);
1045
1046 left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
1047 left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
1048 left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
1049 left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);
1050
1051 // First four rows
1052 dc_u = (left_up_u + 2) >> 2;
1053 dc_v = (left_up_v + 2) >> 2;
1054
1055 pred1_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));
1056
1057 tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
1058 tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
1059 sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);
1060
1061 tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
1062 tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
1063 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1064 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1065
1066 // Second four rows
1067 dc_u = (left_down_u + 2) >> 2;
1068 dc_v = (left_down_v + 2) >> 2;
1069
1070 pred2_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));
1071
1072 tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
1073 tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
1074 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1075 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1076
1077 tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
1078 tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
1079 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1080 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1081
1082 sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
1083 }
1084 else if(top)
1085 {
1086 WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
1087 WORD32 dc_1u, dc_1v, dc_2u, dc_2v;
1088
1089 __m128i top_sh_16x8b;
1090 __m128i intrlv_mask_8x16b, zero_vector;
1091
1092 intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
1093 zero_vector = _mm_setzero_si128();
1094
1095 top_sh_16x8b = _mm_srli_si128(top_16x8b, 1);
1096
1097 tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
1098 tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_sh_16x8b);
1099 tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
1100 tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);
1101
1102 top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
1103 top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
1104 top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
1105 top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);
1106
1107 dc_1u = (top_left_u + 2) >> 2;
1108 dc_1v = (top_left_v + 2) >> 2;
1109 dc_2u = (top_right_u + 2) >> 2;
1110 dc_2v = (top_right_v + 2) >> 2;
1111
1112 pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
1113 dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);
1114
1115 tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
1116 tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
1117 sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);
1118
1119 tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
1120 tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
1121 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1122 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1123
1124 tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
1125 tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
1126 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1127 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1128
1129 tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
1130 tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
1131 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1132 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1133
1134 sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
1135 }
1136 else
1137 {
1138 pred1_16x8b = _mm_set1_epi8(128);
1139
1140 tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
1141 tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
1142 sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);
1143
1144 tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
1145 tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
1146 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1147 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1148
1149 tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
1150 tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
1151 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1152 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1153
1154 tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
1155 tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
1156 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
1157 sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);
1158
1159 sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
1160 }
1161 }
1162
1163 min_sad = MIN3(sad_horz, sad_vert, sad_dc);
1164
1165 /* Finding minimum SAD and doing corresponding prediction*/
1166 if(min_sad < *pu4_sadmin)
1167 {
1168 *pu4_sadmin = min_sad;
1169
1170 if(min_sad == sad_dc)
1171 {
1172 *u4_intra_mode = DC_CH_I8x8;
1173
1174 if(!left)
1175 pred2_16x8b = pred1_16x8b;
1176
1177 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1178 pu1_dst += dst_strd;
1179 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1180 pu1_dst += dst_strd;
1181 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1182 pu1_dst += dst_strd;
1183 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1184 pu1_dst += dst_strd;
1185
1186 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1187 pu1_dst += dst_strd;
1188 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1189 pu1_dst += dst_strd;
1190 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1191 pu1_dst += dst_strd;
1192 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1193 }
1194 else if(min_sad == sad_horz)
1195 {
1196 __m128i left_sh_16x8b, const_14_15_16x8b;
1197
1198 *u4_intra_mode = HORZ_CH_I8x8;
1199
1200 const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
1201
1202 left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
1203 pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 1
1204 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2
1205
1206 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1207 pu1_dst += dst_strd;
1208 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1209
1210 left_16x8b = _mm_slli_si128(left_16x8b, 4);
1211 left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
1212 pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 3
1213 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 4
1214
1215 pu1_dst += dst_strd;
1216 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1217 pu1_dst += dst_strd;
1218 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1219
1220 left_16x8b = _mm_slli_si128(left_16x8b, 4);
1221 left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
1222 pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 5
1223 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 6
1224
1225 pu1_dst += dst_strd;
1226 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1227 pu1_dst += dst_strd;
1228 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1229
1230 left_16x8b = _mm_slli_si128(left_16x8b, 4);
1231 left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
1232 pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 7
1233 pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 8
1234
1235 pu1_dst += dst_strd;
1236 _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
1237 pu1_dst += dst_strd;
1238 _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
1239 }
1240 else
1241 {
1242 *u4_intra_mode = VERT_CH_I8x8;
1243
1244 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1245 pu1_dst += dst_strd;
1246 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1247 pu1_dst += dst_strd;
1248 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1249 pu1_dst += dst_strd;
1250 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1251 pu1_dst += dst_strd;
1252 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1253 pu1_dst += dst_strd;
1254 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1255 pu1_dst += dst_strd;
1256 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1257 pu1_dst += dst_strd;
1258 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
1259 }
1260 }
1261 }
1262